fasta_loader.php 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. <?php
  2. /**
  3. * @defgroup fasta_loader FASTA Feature Loader
  4. * @{
  5. * Provides fasta loading functionality. Creates features based on their specification in a fasta file.
  6. * @}
  7. * @ingroup tripal_feature
  8. */
  /**
   * Form builder for the FASTA loader.
   *
   * Collects the FASTA file location, the organism and Sequence Ontology type
   * of the sequences, an insert/update switch, and a set of optional advanced
   * settings: regular expressions for the name and unique name, an external
   * database cross reference, and a relationship to already loaded features.
   *
   * The organism and external database drop-downs are filled from the chado
   * tables {organism} and {db}; the active database is switched to chado for
   * each query and restored immediately afterwards.
   *
   * @return
   *   The form array.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form() {
    $form = array();

    $form['fasta_file'] = array(
      '#type' => 'textfield',
      '#title' => t('FASTA File'),
      '#description' => t('Please enter the full system path for the FASTA file, or a path within the Drupal
installation (e.g. /sites/default/files/xyz.fasta). The path must be accessible to the
server on which this Drupal instance is running.'),
      '#required' => TRUE,
      '#weight' => 1,
    );

    // get the list of organisms
    $sql = "SELECT * FROM {organism} ORDER BY genus, species";
    $previous_db = tripal_db_set_active('chado');  // use chado database
    $org_rset = db_query($sql);
    tripal_db_set_active($previous_db);  // now use drupal database
    $organisms = array('' => '');
    while ($organism = db_fetch_object($org_rset)) {
      $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
    }
    $form['organism_id'] = array(
      '#title' => t('Organism'),
      // Element types are machine names and must never be passed through t().
      '#type' => 'select',
      '#description' => t("Choose the organism to which these sequences are associated "),
      '#required' => TRUE,
      '#options' => $organisms,
      '#weight' => 2,
    );

    $form['type'] = array(
      '#type' => 'textfield',
      '#title' => t('Sequence Type'),
      '#required' => TRUE,
      '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the FASTA file.'),
      '#weight' => 3,
    );

    // Library selection is currently disabled. The lookup that fed it is
    // disabled together with the element so the form does not run a query
    // whose result is never used. Note that no 'library_id' value is submitted
    // while this stays commented out.
    // $sql = "SELECT L.library_id, L.name, CVT.name as type
    //         FROM {library} L
    //           INNER JOIN {cvterm} CVT ON L.type_id = CVT.cvterm_id
    //         ORDER BY name";
    // $previous_db = tripal_db_set_active('chado');  // use chado database
    // $lib_rset = db_query($sql);
    // tripal_db_set_active($previous_db);  // now use drupal database
    // $libraries = array();
    // $libraries[''] = '';
    // while($library = db_fetch_object($lib_rset)){
    //   $libraries[$library->library_id] = "$library->name ($library->type)";
    // }
    // $form['library_id'] = array (
    //   '#title' => t('Library'),
    //   '#type' => 'select',
    //   '#description' => t("Choose the library to which these sequences are associated "),
    //   '#required' => FALSE,
    //   '#options' => $libraries,
    //   '#weight' => 5,
    // );

    $form['update'] = array(
      '#type' => 'checkbox',
      '#title' => t('Insert and update'),
      '#required' => FALSE,
      '#description' => t('By default only new features are inserted. Select this checkbox to update
features that already exists with the contents from the FASTA file.'),
      '#weight' => 6,
    );

    // Advanced Options. '#collapsed' is only honoured on a fieldset that is
    // also '#collapsible', so both are set.
    $form['advanced'] = array(
      '#type' => 'fieldset',
      '#title' => t('Advanced Options'),
      '#weight' => 7,
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    );
    $form['advanced']['re_help'] = array(
      '#type' => 'item',
      '#value' => t('A regular expression is an advanced method for extracting information from a string of text.
By default, this loader will use the first word in the definition line for each sequence in the FASTA file
as the uniquename for the sequences. If this is not desired, you may use the following regular
expressions to define the positions of the unique name.'),
      '#weight' => 0,
    );
    $form['advanced']['re_name'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the feature name from the FASTA definition line. For example, for a definition line with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^(.*?)\|.*$"'),
      '#weight' => 1,
    );
    $form['advanced']['re_uname'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the unique name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique feature name for each feature from the FASTA definition line. This name must be unique for the organism.'),
      '#weight' => 2,
    );

    // Advanced database cross-reference options
    $form['advanced']['db'] = array(
      '#type' => 'fieldset',
      '#title' => t('External Database Reference'),
      '#weight' => 6,
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    );
    $form['advanced']['db']['re_accession'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the accession'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
      '#weight' => 2,
    );

    // get the list of databases
    $sql = "SELECT * FROM {db} ORDER BY name";
    $previous_db = tripal_db_set_active('chado');  // use chado database
    $db_rset = db_query($sql);
    tripal_db_set_active($previous_db);  // now use drupal database
    $dbs = array('' => '');
    while ($db = db_fetch_object($db_rset)) {
      $dbs[$db->db_id] = "$db->name";
    }
    $form['advanced']['db']['db_id'] = array(
      '#title' => t('External Database'),
      '#type' => 'select',
      '#description' => t("Please choose an external database for which these sequences have a cross reference."),
      '#required' => FALSE,
      '#options' => $dbs,
      '#weight' => 1,
    );

    // Advanced relationship options
    $form['advanced']['relationship'] = array(
      '#type' => 'fieldset',
      '#title' => t('Relationships'),
      '#weight' => 6,
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    );
    // Keys are the relationship term names that are submitted; values are the
    // labels shown to the user.
    $rels = array(
      '' => '',
      'part_of' => 'part of',
      'derives_from' => 'produced by',
    );
    $form['advanced']['relationship']['rel_type'] = array(
      '#title' => t('Relationship Type'),
      '#type' => 'select',
      '#description' => t("Use this option to create associations, or relationships between the
features of this FASTA file and existing features in the database. For
example, to associate a FASTA file of peptides to existing genes or transcript sequence,
select the type 'produced by'. For a CDS sequences select the type 'part of'"),
      '#required' => FALSE,
      '#options' => $rels,
      '#weight' => 5,
    );
    $form['advanced']['relationship']['re_subject'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the parent'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique
name needed to identify the existing sequence for which the
relationship type selected above will apply.'),
      '#weight' => 6,
    );
    $form['advanced']['relationship']['parent_type'] = array(
      '#type' => 'textfield',
      '#title' => t('Parent Type'),
      '#required' => FALSE,
      '#description' => t('Please enter the Sequence Ontology term for the parent. For example
if the FASTA file being loaded is a set of proteins that are
products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
this type must match the type for already loaded features.'),
      '#weight' => 7,
    );

    $form['button'] = array(
      '#type' => 'submit',
      '#value' => t('Import FASTA file'),
      '#weight' => 10,
    );
    return $form;
  }
  /**
   * Validation handler for the FASTA loader form.
   *
   * Checks that:
   *  - the FASTA file can be found and read, either relative to the Drupal
   *    installation or as a full system path;
   *  - the relationship fields are either all provided or all left empty;
   *  - the database and accession expression are provided together;
   *  - the sequence type (and the parent type when a relationship is chosen)
   *    exists in the 'sequence' vocabulary, by name or synonym.
   *
   * The resolved file path is saved in $form_state['storage']['dfile'] for the
   * submit handler.
   *
   * @param $form
   *   The form array (not used).
   * @param $form_state
   *   The form state; 'values' is read and 'storage' is written.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_validate($form, &$form_state) {
    $values = $form_state['values'];
    $fasta_file   = trim($values['fasta_file']);
    $type         = trim($values['type']);
    $re_accession = trim($values['re_accession']);
    $db_id        = $values['db_id'];
    $rel_type     = $values['rel_type'];
    $re_subject   = trim($values['re_subject']);
    $parent_type  = trim($values['parent_type']);

    // check to see if the file is located local to Drupal
    $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
    if (!is_file($dfile)) {
      // if not local to Drupal, the file must be someplace else, just use
      // the full path provided
      $dfile = $fasta_file;
    }
    // is_file()/is_readable() rather than file_exists(): a directory or an
    // unreadable file cannot be loaded and must be rejected here.
    if (!is_file($dfile) or !is_readable($dfile)) {
      form_set_error('fasta_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
    }

    // make sure if a relationship is specified that all fields are provided.
    if (($rel_type or $parent_type) and !$re_subject) {
      form_set_error('re_subject', t("Please provide a regular expression for the parent"));
    }
    if (($rel_type or $re_subject) and !$parent_type) {
      form_set_error('parent_type', t("Please provide a SO term for the parent"));
    }
    if (($parent_type or $re_subject) and !$rel_type) {
      form_set_error('rel_type', t("Please select a relationship type"));
    }

    // make sure if a database is specified that all fields are provided
    if ($db_id and !$re_accession) {
      form_set_error('re_accession', t("Please provide a regular expression for the accession"));
    }
    if ($re_accession and !$db_id) {
      form_set_error('db_id', t("Please select a database"));
    }

    // check to make sure the types exists
    $cvtermsql = "SELECT CVT.cvterm_id
                  FROM {cvterm} CVT
                    INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                    LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                  WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";

    // The vocabulary tables belong to chado, so switch the active database
    // for the lookups (as the other chado queries in this file do) and switch
    // back before reporting any error.
    $previous_db = tripal_db_set_active('chado');
    $type_term = db_fetch_object(db_query($cvtermsql, 'sequence', $type, $type));
    $parent_term = TRUE;
    if ($rel_type) {
      $parent_term = db_fetch_object(db_query($cvtermsql, 'sequence', $parent_type, $parent_type));
    }
    tripal_db_set_active($previous_db);

    if (!$type_term) {
      form_set_error('type', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
    }
    if (!$parent_term) {
      form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
    }

    // hand the resolved path over to the submit handler
    $form_state['storage']['dfile'] = $dfile;
  }
  /**
   * Submit handler for the FASTA loader form.
   *
   * Gathers the submitted settings and registers a job through
   * tripal_add_job() with 'tripal_feature_load_fasta' as the callback. The
   * argument array is built in the order of that function's parameter list,
   * ending with the id of the submitting user.
   *
   * @param $form
   *   The form array (not used).
   * @param $form_state
   *   The form state. The file path is taken from ['storage']['dfile'], which
   *   the validation handler sets.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_submit($form, &$form_state) {
    global $user;

    $values = $form_state['values'];
    $dfile = $form_state['storage']['dfile'];

    // The library element is commented out in the form, so 'library_id' is
    // normally absent from the submission; fall back to NULL without touching
    // an undefined index.
    $library_id = isset($values['library_id']) ? $values['library_id'] : NULL;

    $args = array(
      $dfile,
      $values['organism_id'],
      trim($values['type']),
      $library_id,
      trim($values['re_name']),
      trim($values['re_uname']),
      trim($values['re_accession']),
      $values['db_id'],
      $values['rel_type'],
      trim($values['re_subject']),
      trim($values['parent_type']),
      trim($values['update']),
      $user->uid,
    );

    tripal_add_job("Import FASTA file: $dfile", 'tripal_feature',
      'tripal_feature_load_fasta', $args, $user->uid);
  }
  /**
   * Loads the sequences of a FASTA file as features.
   *
   * Reads the file line by line. Every definition line (starting with '>')
   * begins a new record; the following lines, with surrounding whitespace
   * removed, are concatenated into its residues. Each completed record is
   * handed to tripal_feature_fasta_loader_insert_feature().
   *
   * Records for which no name or no unique name can be extracted from the
   * definition line are skipped with a warning, and their residues are
   * discarded rather than carried over into the following record.
   *
   * @param $dfile
   *   Path of the FASTA file.
   * @param $organism_id
   *   Organism id handed to the insert routine.
   * @param $type
   *   Sequence type term handed to the insert routine.
   * @param $library_id
   *   Handed to the insert routine unchanged.
   * @param $re_name
   *   Regular expression (without delimiters) whose first capture group is
   *   the feature name. When empty, the first word of the definition line
   *   (up to whitespace or '|') is used.
   * @param $re_uname
   *   Same as $re_name, for the unique name.
   * @param $re_accession
   *   Regular expression whose first capture group is the accession; may be
   *   empty.
   * @param $db_id
   *   External database id handed to the insert routine.
   * @param $rel_type
   *   Relationship type handed to the insert routine.
   * @param $re_subject
   *   Regular expression whose first capture group is the unique name of the
   *   related (parent) feature; may be empty.
   * @param $parent_type
   *   Parent type term handed to the insert routine.
   * @param $update
   *   Update switch handed to the insert routine.
   * @param $uid
   *   Id of the submitting user (not used here).
   * @param $job
   *   Optional job id. When given, progress is reported through
   *   tripal_job_set_progress() roughly every 1% of the lines.
   *
   * @return
   *   Always an empty string.
   *
   * NOTE(review): the expressions are wrapped in '/' delimiters as entered,
   * so a pattern containing an unescaped '/' will not compile - confirm
   * whether that should be documented on the form.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_load_fasta($dfile, $organism_id, $type,
    $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
    $re_subject, $parent_type, $update, $uid, $job = NULL)
  {
    print "Opening FASTA file $dfile\n";

    // Line endings are kept on purpose. FILE_SKIP_EMPTY_LINES is not passed
    // because it only has an effect together with FILE_IGNORE_NEW_LINES;
    // blank lines simply contribute nothing once trimmed.
    $lines = is_readable($dfile) ? file($dfile) : FALSE;
    if ($lines === FALSE) {
      print "ERROR: cannot read the FASTA file $dfile\n";
      return '';
    }

    $num_lines = count($lines);
    $interval = max(1, intval($num_lines * 0.01));

    // First word of the definition line, i.e. everything up to the first
    // whitespace or '|'. The tail is optional so a one-word definition line
    // matches even without a trailing newline.
    $first_word = '/^\s*(.*?)(?:[\s\|].*)?$/';

    // This loader has no source value; the insert routine accepts the
    // parameter, so NULL is passed for it.
    $source = NULL;

    $i = 0;
    $in_record = FALSE;
    $name = '';
    $uname = '';
    $accession = '';
    $subject = '';
    $residues = '';

    foreach ($lines as $line) {
      $i++;  // update the line count

      // update the job status every 1% features
      if ($job and $i % $interval == 0) {
        tripal_job_set_progress($job, intval(($i / $num_lines) * 100));
      }

      if (strncmp($line, '>', 1) !== 0) {
        $residues .= trim($line);
        continue;
      }

      // A definition line: store the record collected so far, then start a
      // new one. Residues are always reset so nothing leaks between records.
      if ($in_record) {
        _tripal_feature_fasta_loader_store_record($name, $uname, $db_id,
          $accession, $subject, $rel_type, $parent_type, $library_id,
          $organism_id, $type, $source, $residues, $update, $re_name);
      }
      $in_record = TRUE;
      $residues = '';
      $defline = substr($line, 1);

      // get the name, uniquename, accession and relationship subject from
      // the definition line
      if ($re_name) {
        $name = _tripal_feature_fasta_loader_first_capture("/$re_name/", $defline);
        if ($name === '') {
          print "Regular expression for the feature name finds nothing\n";
        }
      }
      else {
        $name = _tripal_feature_fasta_loader_first_capture($first_word, $defline);
      }

      if ($re_uname) {
        $uname = _tripal_feature_fasta_loader_first_capture("/$re_uname/", $defline);
        if ($uname === '') {
          print "Regular expression for the feature unique name finds nothing\n";
        }
      }
      else {
        $uname = _tripal_feature_fasta_loader_first_capture($first_word, $defline);
      }

      // The accession and subject expressions are optional; an empty
      // expression must not be run, it would match without any capture group.
      $accession = $re_accession ? _tripal_feature_fasta_loader_first_capture("/$re_accession/", $defline) : '';
      $subject = $re_subject ? _tripal_feature_fasta_loader_first_capture("/$re_subject/", $defline) : '';
    }

    // now load the last sequence in the file
    if ($in_record) {
      _tripal_feature_fasta_loader_store_record($name, $uname, $db_id,
        $accession, $subject, $rel_type, $parent_type, $library_id,
        $organism_id, $type, $source, $residues, $update, $re_name);
    }
    return '';
  }

  /**
   * Returns the trimmed first capture group of $pattern in $text, or '' when
   * the pattern does not match or has no first group.
   */
  function _tripal_feature_fasta_loader_first_capture($pattern, $text) {
    if (preg_match($pattern, $text, $matches) and isset($matches[1])) {
      return trim($matches[1]);
    }
    return '';
  }

  /**
   * Hands one parsed FASTA record to the insert routine, or skips it with a
   * warning when its name or unique name is empty.
   */
  function _tripal_feature_fasta_loader_store_record($name, $uname, $db_id,
    $accession, $subject, $rel_type, $parent_type, $library_id, $organism_id,
    $type, $source, $residues, $update, $re_name)
  {
    if ($name === '' or $uname === '') {
      print "WARNING: skipping a sequence without a name or unique name: '$name ($uname)'\n";
      return;
    }
    tripal_feature_fasta_loader_insert_feature($name, $uname, $db_id,
      $accession, $subject, $rel_type, $parent_type, $library_id, $organism_id,
      $type, $source, $residues, $update, $re_name);
  }
  /**
   * Inserts or updates a single feature in chado and adds its optional
   * database cross reference and relationship.
   *
   * The active database is switched to chado for the duration of the call and
   * is restored on every path, including the failure paths.
   *
   * @param $name
   *   Feature name.
   * @param $uname
   *   Feature unique name; together with the organism and type it identifies
   *   an existing feature.
   * @param $db_id
   *   Id of the external database; when empty no cross reference is added.
   * @param $accession
   *   Accession in the external database.
   * @param $parent
   *   Unique name of the existing feature to relate this feature to.
   * @param $rel_type
   *   Name of the relationship term; when empty no relationship is added.
   * @param $parent_type
   *   Sequence type term of the parent feature.
   * @param $library_id
   *   Not used.
   * @param $organism_id
   *   Organism id of the feature (and of the parent).
   * @param $type
   *   Sequence type term (name or synonym in the 'sequence' vocabulary).
   * @param $source
   *   Not used.
   * @param $residues
   *   The sequence.
   * @param $update
   *   When TRUE an existing feature is updated, otherwise it is left as is.
   * @param $re_name
   *   The name expression given by the user. An existing feature's name is
   *   only overwritten on update when this is non-empty. Optional so that
   *   callers passing 13 arguments keep working.
   *
   * @return
   *   0 when the feature could not be stored; nothing otherwise.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_loader_insert_feature($name, $uname, $db_id, $accession,
    $parent, $rel_type, $parent_type, $library_id, $organism_id, $type,
    $source, $residues, $update, $re_name = '')
  {
    $previous_db = tripal_db_set_active('chado');
    $status = _tripal_feature_fasta_loader_insert_feature_chado($name, $uname,
      $db_id, $accession, $parent, $rel_type, $parent_type, $organism_id,
      $type, $residues, $update, $re_name);
    // Restore the previous database no matter how the work above ended.
    tripal_db_set_active($previous_db);
    return $status;
  }

  /**
   * Does the work of tripal_feature_fasta_loader_insert_feature(); expects
   * chado to be the active database already.
   */
  function _tripal_feature_fasta_loader_insert_feature_chado($name, $uname,
    $db_id, $accession, $parent, $rel_type, $parent_type, $organism_id,
    $type, $residues, $update, $re_name)
  {
    // first get the type for this sequence
    $cvtermsql = "SELECT CVT.cvterm_id
                  FROM {cvterm} CVT
                    INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                    LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                  WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
    $cvterm = db_fetch_object(db_query($cvtermsql, 'sequence', $type, $type));
    if (!$cvterm) {
      print "ERROR: cannot find the term type: '$type'\n";
      return 0;
    }

    // check to see if this feature already exists
    $feature_sql = "SELECT * FROM {feature}
                    WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
    $feature = db_fetch_object(db_query($feature_sql, $organism_id, $uname, $cvterm->cvterm_id));

    if (!$feature) {
      // now insert the feature
      $sql = "INSERT INTO {feature} (organism_id, name, uniquename, residues, seqlen, md5checksum,type_id,is_analysis,is_obsolete)
              VALUES(%d,'%s','%s','%s',%d, '%s', %d, %s, %s)";
      $result = db_query($sql, $organism_id, $name, $uname, $residues, strlen($residues),
        md5($residues), $cvterm->cvterm_id, 'false', 'false');
      if (!$result) {
        print "ERROR: failed to insert feature '$name ($uname)'\n";
        return 0;
      }
      print "Inserted feature $name ($uname)\n";
    }
    elseif ($update) {
      // we do not want to wipe out the name if the user did not intend for this to
      // happen. The uniquename must match the sequence but the name may not.
      // so, we'll only update the name if the users specified an 're_name' regular
      // expression.
      if ($re_name) {
        $sql = "UPDATE {feature}
                SET name = '%s', residues = '%s', seqlen = %d, md5checksum = '%s'
                WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
        $result = db_query($sql, $name, $residues, strlen($residues), md5($residues),
          $organism_id, $uname, $cvterm->cvterm_id);
      }
      else {
        $sql = "UPDATE {feature}
                SET residues = '%s', seqlen = %d, md5checksum = '%s'
                WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
        $result = db_query($sql, $residues, strlen($residues), md5($residues),
          $organism_id, $uname, $cvterm->cvterm_id);
      }
      if (!$result) {
        print "ERROR: failed to update feature '$name ($uname)'\n";
        return 0;
      }
      print "Updated feature $name ($uname)\n";
    }
    else {
      print "WARNING: feature already exists, skipping: '$name ($uname)'\n";
    }

    // now get the feature
    $feature = db_fetch_object(db_query($feature_sql, $organism_id, $uname, $cvterm->cvterm_id));
    if (!$feature) {
      print "Something bad has happened: $organism_id, $uname, $cvterm->cvterm_id\n";
      return 0;
    }

    // now add the database cross reference
    if ($db_id) {
      if ((string) $accession === '') {
        // Without an accession there is nothing meaningful to reference.
        print "WARNING: no accession found, cannot add a database cross reference for: '$name ($uname)'\n";
      }
      else {
        // check to see if this accession reference exists, if not add it
        $dbxrefsql = "SELECT * FROM {dbxref} WHERE db_id = %d and accession = '%s'";
        $dbxref = db_fetch_object(db_query($dbxrefsql, $db_id, $accession));
        if (!$dbxref) {
          $sql = "INSERT INTO {dbxref} (db_id,accession) VALUES (%d,'%s')";
          $result = db_query($sql, $db_id, $accession);
          if (!$result) {
            print "WARNING: could not add external database acession: '$name accession: $accession'\n";
          }
          $dbxref = db_fetch_object(db_query($dbxrefsql, $db_id, $accession));
        }

        // check to see if the feature dbxref record exists if not, then add it.
        // Only possible when the dbxref record is actually there.
        if ($dbxref) {
          $fdbxrefsql = "SELECT * FROM {feature_dbxref} WHERE feature_id = %d and dbxref_id = %d";
          $fdbxref = db_fetch_object(db_query($fdbxrefsql, $feature->feature_id, $dbxref->dbxref_id));
          if (!$fdbxref) {
            $sql = "INSERT INTO {feature_dbxref} (feature_id,dbxref_id) VALUES (%d,%d)";
            $result = db_query($sql, $feature->feature_id, $dbxref->dbxref_id);
            if (!$result) {
              print "WARNING: could not associate database cross reference with feature: '$name accession: $accession'\n";
            }
            else {
              print "Added database crossreference $name ($uname) -> $accession\n";
            }
          }
        }
      }
    }

    // now add in the relationship if one exists. First, get the parent type for the relationship
    // then get the parent feature
    if ($rel_type) {
      $relation = "'$uname' ($type) $rel_type '$parent' ($parent_type)";
      $parentcvterm = db_fetch_object(db_query($cvtermsql, 'sequence', $parent_type, $parent_type));
      $relcvterm = db_fetch_object(db_query($cvtermsql, 'relationship', $rel_type, $rel_type));

      if (!$parentcvterm) {
        print "WARNING: cannot establish relationship $relation: Cannot find the parent type\n";
      }
      elseif (!$relcvterm) {
        print "WARNING: cannot establish relationship $relation: Cannot find the relationship type\n";
      }
      else {
        $parent_feature = db_fetch_object(db_query($feature_sql, $organism_id, $parent, $parentcvterm->cvterm_id));
        if (!$parent_feature) {
          print "WARNING: cannot establish relationship $relation: Cannot find the parent\n";
        }
        else {
          // check to see if the relationship already exists
          $sql = "SELECT * FROM {feature_relationship} WHERE subject_id = %d and object_id = %d and type_id = %d";
          $rel = db_fetch_object(db_query($sql, $feature->feature_id, $parent_feature->feature_id, $relcvterm->cvterm_id));
          if ($rel) {
            print "WARNING: relationship already exists, skipping $relation\n";
          }
          else {
            $sql = "INSERT INTO {feature_relationship} (subject_id,object_id,type_id)
                    VALUES (%d,%d,%d)";
            $result = db_query($sql, $feature->feature_id, $parent_feature->feature_id, $relcvterm->cvterm_id);
            if (!$result) {
              print "WARNING: failed to insert feature relationship $relation\n";
            }
            else {
              print "Inserted relationship relationship: $relation\n";
            }
          }
        }
      }
    }
  }