fasta_loader.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. <?php
  2. function tripal_feature_fasta_load_form (){
  3. $form['fasta_file']= array(
  4. '#type' => 'textfield',
  5. '#title' => t('FASTA File'),
  6. '#description' => t('Please enter the full system path for the FASTA file, or a path within the Drupal
  7. installation (e.g. /sites/default/files/xyz.obo). The path must be accessible to the
  8. server on which this Drupal instance is running.'),
  9. '#required' => TRUE,
  10. '#weight' => 1
  11. );
  12. // get the list of organisms
  13. $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  14. $previous_db = tripal_db_set_active('chado'); // use chado database
  15. $org_rset = db_query($sql);
  16. tripal_db_set_active($previous_db); // now use drupal database
  17. $organisms = array();
  18. $organisms[''] = '';
  19. while($organism = db_fetch_object($org_rset)){
  20. $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  21. }
  22. $form['organism_id'] = array (
  23. '#title' => t('Organism'),
  24. '#type' => t('select'),
  25. '#description' => t("Choose the organism to which these sequences are associated "),
  26. '#required' => TRUE,
  27. '#options' => $organisms,
  28. '#weight' => 2,
  29. );
  30. $form['type']= array(
  31. '#type' => 'textfield',
  32. '#title' => t('Sequence Type'),
  33. '#required' => TRUE,
  34. '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the FASTA file.'),
  35. '#weight' => 3
  36. );
  37. // get the list of organisms
  38. $sql = "SELECT L.library_id, L.name, CVT.name as type
  39. FROM {library} L
  40. INNER JOIN {cvterm} CVT ON L.type_id = CVT.cvterm_id
  41. ORDER BY name";
  42. $previous_db = tripal_db_set_active('chado'); // use chado database
  43. $lib_rset = db_query($sql);
  44. tripal_db_set_active($previous_db); // now use drupal database
  45. $libraries = array();
  46. $libraries[''] = '';
  47. while($library = db_fetch_object($lib_rset)){
  48. $libraries[$library->library_id] = "$library->name ($library->type)";
  49. }
  50. // $form['library_id'] = array (
  51. // '#title' => t('Library'),
  52. // '#type' => t('select'),
  53. // '#description' => t("Choose the library to which these sequences are associated "),
  54. // '#required' => FALSE,
  55. // '#options' => $libraries,
  56. // '#weight' => 5,
  57. // );
  58. $form['update']= array(
  59. '#type' => 'checkbox',
  60. '#title' => t('Insert and update'),
  61. '#required' => FALSE,
  62. '#description' => t('By default only new features are inserted. Select this checkbox to update
  63. features that already exists with the contents from the FASTA file.'),
  64. '#weight' => 6
  65. );
  66. // Advanced Options
  67. $form['advanced'] = array(
  68. '#type' => 'fieldset',
  69. '#title' => t('Advanced Options'),
  70. '#weight'=> 7,
  71. '#collapsed' => TRUE
  72. );
  73. $form['advanced']['re_help']= array(
  74. '#type' => 'item',
  75. '#value' => t('A regular expression is an advanced method for extracting information from a string of text.
  76. By default, this loader will use the first word in the definition line for each sequence in the FASTA file
  77. as the uniquename for the sequences. If this is not desired, you may use the following regular
  78. expressions to define the postions of the unique name.'),
  79. '#weight' => 0
  80. );
  81. $form['advanced']['re_name']= array(
  82. '#type' => 'textfield',
  83. '#title' => t('Regular expression for the name'),
  84. '#required' => FALSE,
  85. '#description' => t('Enter the regular expression that will extract the feature name from the FASTA definition line. For example, for a defintion line with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^(.*?)\|.*$"'),
  86. '#weight' => 1
  87. );
  88. $form['advanced']['re_uname']= array(
  89. '#type' => 'textfield',
  90. '#title' => t('Regular expression for the unique name'),
  91. '#required' => FALSE,
  92. '#description' => t('Enter the regular expression that will extract the unique feature name for each feature from the FASTA definition line. This name must be unique for the organism.'),
  93. '#weight' => 2
  94. );
  95. // Advanced database cross-reference optoins
  96. $form['advanced']['db'] = array(
  97. '#type' => 'fieldset',
  98. '#title' => t('External Database Reference'),
  99. '#weight'=> 6,
  100. '#collapsed' => TRUE
  101. );
  102. $form['advanced']['db']['re_accession']= array(
  103. '#type' => 'textfield',
  104. '#title' => t('Regular expression for the accession'),
  105. '#required' => FALSE,
  106. '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
  107. '#weight' => 2
  108. );
  109. // get the list of databases
  110. $sql = "SELECT * FROM {db} ORDER BY name";
  111. $previous_db = tripal_db_set_active('chado'); // use chado database
  112. $db_rset = db_query($sql);
  113. tripal_db_set_active($previous_db); // now use drupal database
  114. $dbs = array();
  115. $dbs[''] = '';
  116. while($db = db_fetch_object($db_rset)){
  117. $dbs[$db->db_id] = "$db->name";
  118. }
  119. $form['advanced']['db']['db_id'] = array (
  120. '#title' => t('External Database'),
  121. '#type' => t('select'),
  122. '#description' => t("Plese choose an external database for which these sequences have a cross reference."),
  123. '#required' => FALSE,
  124. '#options' => $dbs,
  125. '#weight' => 1,
  126. );
  127. $form['advanced']['relationship'] = array(
  128. '#type' => 'fieldset',
  129. '#title' => t('Relationships'),
  130. '#weight'=> 6,
  131. '#collapsed' => TRUE
  132. );
  133. $rels = array();
  134. $rels[''] = '';
  135. $rels['part_of'] = 'part of';
  136. $rels['derives_from'] = 'produced by';
  137. // Advanced references options
  138. $form['advanced']['relationship']['rel_type']= array(
  139. '#title' => t('Relationship Type'),
  140. '#type' => t('select'),
  141. '#description' => t("Use this option to create associations, or relationships between the
  142. features of this FASTA file and existing features in the database. For
  143. example, to associate a FASTA file of peptides to existing genes or transcript sequence,
  144. select the type 'produced by'. For a CDS sequences select the type 'part of'"),
  145. '#required' => FALSE,
  146. '#options' => $rels,
  147. '#weight' => 5,
  148. );
  149. $form['advanced']['relationship']['re_subject']= array(
  150. '#type' => 'textfield',
  151. '#title' => t('Regular expression for the parent'),
  152. '#required' => FALSE,
  153. '#description' => t('Enter the regular expression that will extract the unique
  154. name needed to identify the existing sequence for which the
  155. relationship type selected above will apply.'),
  156. '#weight' => 6
  157. );
  158. $form['advanced']['relationship']['parent_type']= array(
  159. '#type' => 'textfield',
  160. '#title' => t('Parent Type'),
  161. '#required' => FALSE,
  162. '#description' => t('Please enter the Sequence Ontology term for the parent. For example
  163. if the FASTA file being loaded is a set of proteins that are
  164. products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
  165. this type must match the type for already loaded features.'),
  166. '#weight' => 7
  167. );
  168. $form['button'] = array(
  169. '#type' => 'submit',
  170. '#value' => t('Import FASTA file'),
  171. '#weight' => 10,
  172. );
  173. return $form;
  174. }
  175. /*************************************************************************
  176. *
  177. */
  178. function tripal_feature_fasta_load_form_validate($form, &$form_state){
  179. $fasta_file = trim($form_state['values']['fasta_file']);
  180. $organism_id = $form_state['values']['organism_id'];
  181. $type = trim($form_state['values']['type']);
  182. $update = trim($form_state['values']['update']);
  183. $library_id = $form_state['values']['library_id'];
  184. $re_name = trim($form_state['values']['re_name']);
  185. $re_uname = trim($form_state['values']['re_uname']);
  186. $re_accession = trim($form_state['values']['re_accession']);
  187. $db_id = $form_state['values']['db_id'];
  188. $rel_type = $form_state['values']['rel_type'];
  189. $re_subject = trim($form_state['values']['re_subject']);
  190. $parent_type = trim($form_state['values']['parent_type']);
  191. // check to see if the file is located local to Drupal
  192. $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
  193. if(!file_exists($dfile)){
  194. // if not local to Drupal, the file must be someplace else, just use
  195. // the full path provided
  196. $dfile = $fasta_file;
  197. }
  198. if(!file_exists($dfile)){
  199. form_set_error('fasta_file',t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
  200. }
  201. // make sure if a relationship is specified that all fields are provided.
  202. if(($rel_type or $parent_type) and !$re_subject){
  203. form_set_error('re_subject',t("Please provide a regular expression for the parent"));
  204. }
  205. if(($rel_type or $re_subject) and !$parent_type){
  206. form_set_error('parent_type',t("Please provide a SO term for the parent"));
  207. }
  208. if(($parent_type or $re_subject) and !$rel_type){
  209. form_set_error('rel_type',t("Please select a relationship type"));
  210. }
  211. // make sure if a database is specified that all fields are provided
  212. if($db_id and !$re_accession){
  213. form_set_error('re_accession',t("Please provide a regular expression for the accession"));
  214. }
  215. if($re_accession and !$db_id){
  216. form_set_error('db_id',t("Please select a database"));
  217. }
  218. // check to make sure the types exists
  219. $cvtermsql = "SELECT CVT.cvterm_id
  220. FROM {cvterm} CVT
  221. INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
  222. LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
  223. WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  224. $cvterm = db_fetch_object(db_query($cvtermsql,'sequence',$type,$type));
  225. if(!$cvterm){
  226. form_set_error('type',t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
  227. }
  228. if($rel_type){
  229. $cvterm = db_fetch_object(db_query($cvtermsql,'sequence',$parent_type,$parent_type));
  230. if(!$cvterm){
  231. form_set_error('parent_type',t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
  232. }
  233. }
  234. // check to make sure the 'relationship' and 'sequence' ontologies are loaded
  235. $form_state['storage']['dfile'] = $dfile;
  236. }
  237. /*************************************************************************
  238. *
  239. */
  240. function tripal_feature_fasta_load_form_submit ($form, &$form_state){
  241. global $user;
  242. $dfile = $form_state['storage']['dfile'];
  243. $organism_id = $form_state['values']['organism_id'];
  244. $type = trim($form_state['values']['type']);
  245. $update = trim($form_state['values']['update']);
  246. $library_id = $form_state['values']['library_id'];
  247. $re_name = trim($form_state['values']['re_name']);
  248. $re_uname = trim($form_state['values']['re_uname']);
  249. $re_accession = trim($form_state['values']['re_accession']);
  250. $db_id = $form_state['values']['db_id'];
  251. $rel_type = $form_state['values']['rel_type'];
  252. $re_subject = trim($form_state['values']['re_subject']);
  253. $parent_type = trim($form_state['values']['parent_type']);
  254. $args = array($dfile,$organism_id,$type,$library_id,$re_name,$re_uname,
  255. $re_accession,$db_id,$rel_type,$re_subject,$parent_type,$update,$user->uid);
  256. tripal_add_job("Import FASTA file: $dfile",'tripal_core',
  257. 'tripal_feature_load_fasta',$args,$user->uid);
  258. }
  259. /*************************************************************************
  260. *
  261. */
  262. function tripal_feature_load_fasta($dfile, $organism_id, $type,
  263. $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
  264. $re_subject, $parent_type, $update,$uid, $job = NULL)
  265. {
  266. print "Opening FASTA file $dfile\n";
  267. $lines = file($dfile,FILE_SKIP_EMPTY_LINES);
  268. $i = 0;
  269. $name = '';
  270. $residues = '';
  271. $num_lines = sizeof($lines);
  272. $interval = intval($num_lines * 0.01);
  273. if($interval == 0){
  274. $interval = 1;
  275. }
  276. foreach ($lines as $line_num => $line) {
  277. $i++; // update the line count
  278. // update the job status every 1% features
  279. if($job and $i % $interval == 0){
  280. tripal_job_set_progress($job,intval(($i/$num_lines)*100));
  281. }
  282. // get the name, uniquename, accession and relationship subject from
  283. // the definition line
  284. if(preg_match('/^>/',$line)){
  285. // if we have a feature name then we are starting a new sequence
  286. // and we need to insert this one
  287. if($name){
  288. tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,
  289. $accession,$subject,$rel_type,$parent_type,$library_id,$organism_id,$type,
  290. $source,$residues,$update);
  291. $residues = '';
  292. $name = '';
  293. }
  294. $line = preg_replace("/^>/",'',$line);
  295. if($re_name){
  296. if(!preg_match("/$re_name/",$line,$matches)){
  297. print "Regular expression for the feature name finds nothing\n";
  298. }
  299. $name = trim($matches[1]);
  300. } else {
  301. preg_match("/^(.*?)[\s\|].*$/",$line,$matches);
  302. $name = trim($matches[1]);
  303. }
  304. if($re_uname){
  305. preg_match("/$re_uname/",$line,$matches);
  306. $uname = trim($matches[1]);
  307. } else {
  308. preg_match("/^(.*?)[\s\|].*$/",$line,$matches);
  309. $name = trim($matches[1]);
  310. }
  311. preg_match("/$re_accession/",$line,$matches);
  312. $accession = trim($matches[1]);
  313. preg_match("/$re_subject/",$line,$matches);
  314. $subject = trim($matches[1]);
  315. // print "Name: $name, UName: $uname, Accession: $accession, Subject: $subject\n";
  316. }
  317. else {
  318. $residues .= trim($line);
  319. }
  320. }
  321. // now load the last sequence in the file
  322. tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,
  323. $accession,$subject,$rel_type,$parent_type,$library_id,$organism_id,$type,
  324. $source,$residues,$update);
  325. return '';
  326. }
  327. /*************************************************************************
  328. *
  329. */
  330. function tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,$accession,
  331. $parent,$rel_type,$parent_type,$library_id,$organism_id,$type,
  332. $source,$residues,$update)
  333. {
  334. $previous_db = tripal_db_set_active('chado');
  335. // first get the type for this sequence
  336. $cvtermsql = "SELECT CVT.cvterm_id
  337. FROM {cvterm} CVT
  338. INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
  339. LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
  340. WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  341. $cvterm = db_fetch_object(db_query($cvtermsql,'sequence',$type,$type));
  342. if(!$cvterm){
  343. print "ERROR: cannot find the term type: '$type'\n";
  344. return 0;
  345. }
  346. // check to see if this feature already exists
  347. $feature_sql = "SELECT * FROM {feature}
  348. WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
  349. $feature = db_fetch_object(db_query($feature_sql,$organism_id,$uname,$cvterm->cvterm_id));
  350. if(!$feature){
  351. // now insert the feature
  352. $sql = "INSERT INTO {feature} (organism_id, name, uniquename, residues, seqlen, md5checksum,type_id,is_analysis,is_obsolete)
  353. VALUES(%d,'%s','%s','%s',%d, '%s', %d, %s, %s)";
  354. $result = db_query($sql,$organism_id,$name,$uname,$residues,strlen($residues),
  355. md5($residues),$cvterm->cvterm_id,'false','false');
  356. if(!$result){
  357. print "ERROR: failed to insert feature '$name ($uname)'\n";
  358. return 0;
  359. } else {
  360. print "Inserted feature $name ($uname)\n";
  361. }
  362. } else {
  363. if($update){
  364. $sql = "UPDATE {feature}
  365. SET name = '%s', residues = '%s', seqlen = '%s', md5checksum = '%s'
  366. WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
  367. $result = db_query($sql,$name,$residues,strlen($residues),md5($residues),$organism_id,$uname,$cvterm->cvterm_id);
  368. if(!$result){
  369. print "ERROR: failed to update feature '$name ($uname)'\n";
  370. return 0;
  371. } else {
  372. print "Updated feature $name ($uname)\n";
  373. }
  374. } else {
  375. print "WARNING: feature already exists, skipping: '$name ($uname)'\n";
  376. }
  377. }
  378. // now get the feature
  379. $feature = db_fetch_object(db_query($feature_sql,$organism_id,$uname,$cvterm->cvterm_id));
  380. if(!$feature){
  381. print "Something bad has happened: $organism_id, $uname, $cvterm->cvterm_id\n";
  382. return 0;
  383. }
  384. // now add the database cross reference
  385. if($db_id){
  386. // check to see if this accession reference exists, if not add it
  387. $dbxrefsql = "SELECT * FROM {dbxref} WHERE db_id = %s and accession = '%s'";
  388. $dbxref = db_fetch_object(db_query($dbxrefsql,$db_id,$accession));
  389. if(!$dbxref){
  390. $sql = "INSERT INTO {dbxref} (db_id,accession) VALUES (%d,'%s')";
  391. $result = db_query($sql,$db_id,$accession);
  392. if(!$result){
  393. print "WARNING: could not add external database acession: '$name accession: $accession'\n";
  394. }
  395. $dbxref = db_fetch_object(db_query($dbxrefsql,$db_id,$accession));
  396. }
  397. // check to see if the feature dbxref record exists if not, then add it
  398. $fdbxrefsql = "SELECT * FROM {feature_dbxref} WHERE feature_id = %d and dbxref_id = %d";
  399. $fdbxref = db_fetch_object(db_query($fdbxrefsql,$feature->feature_id,$dbxref->dbxref_id));
  400. if(!$fdbxref){
  401. $sql = "INSERT INTO {feature_dbxref} (feature_id,dbxref_id) VALUES (%d,%d)";
  402. $result = db_query($sql,$feature->feature_id,$dbxref->dbxref_id);
  403. if(!$result){
  404. print "WARNING: could not associate database cross reference with feature: '$name accession: $accession'\n";
  405. } else {
  406. print "Added database crossreference $name ($uname) -> $accession\n";
  407. }
  408. }
  409. }
  410. // now add in the relationship if one exists. First, get the parent type for the relationship
  411. // then get the parent feature
  412. if($rel_type){
  413. $parentcvterm = db_fetch_object(db_query($cvtermsql,'sequence',$parent_type,$parent_type));
  414. $relcvterm = db_fetch_object(db_query($cvtermsql,'relationship',$rel_type,$rel_type));
  415. $parent_feature = db_fetch_object(db_query($feature_sql,$organism_id,$parent,$parentcvterm->cvterm_id));
  416. if($parent_feature){
  417. // check to see if the relationship already exists
  418. $sql = "SELECT * FROM {feature_relationship} WHERE subject_id = %d and object_id = %d and type_id = %d";
  419. $rel = db_fetch_object(db_query($sql,$feature->feature_id,$parent_feature->feature_id,$relcvterm->cvterm_id));
  420. if($rel){
  421. print "WARNING: relationship already exists, skipping '$uname' ($type) $rel_type '$parent' ($parent_type)\n";
  422. } else {
  423. $sql = "INSERT INTO {feature_relationship} (subject_id,object_id,type_id)
  424. VALUES (%d,%d,%d)";
  425. $result = db_query($sql,$feature->feature_id,$parent_feature->feature_id,$relcvterm->cvterm_id);
  426. if(!$result){
  427. print "WARNING: failed to insert feature relationship '$uname' ($type) $rel_type '$parent' ($parent_type)\n";
  428. } else {
  429. print "Inserted relationship relationship: '$uname' ($type) $rel_type '$parent' ($parent_type)\n";
  430. }
  431. }
  432. }
  433. else {
  434. print "WARNING: cannot establish relationship '$uname' ($type) $rel_type '$parent' ($parent_type): Cannot find the parent\n";
  435. }
  436. }
  437. tripal_db_set_active($previous_db);
  438. }