tripal_chado.fasta_loader.inc 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004
  1. <?php
  2. /**
  3. * @file
  4. * Provides fasta loading functionality. Creates features based on their specification
  5. * in a fasta file.
  6. */
  7. /**
  8. * @defgroup fasta_loader FASTA Feature Loader
  9. * @ingroup tripal_chado
  10. * @{
  11. * Provides fasta loading functionality.
  12. * Creates features based on their specification
  13. * in a fasta file.
  14. * @}
  15. */
  /**
   * Builds the form used to submit a FASTA loading job.
   *
   * The form collects the FASTA file path, the organism, the Sequence Ontology
   * type of the sequences, the insert/update method, the name matching mode,
   * the analysis the features belong to and a set of optional "advanced"
   * settings (regular expressions for names, an external database cross
   * reference and a relationship to already loaded features).
   *
   * The organism, analysis and database select lists are populated from the
   * Chado organism, analysis and db tables.
   *
   * @return
   *   A Drupal form array.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form() {
    $form['fasta_file'] = array(
      '#type' => 'textfield',
      '#title' => t('FASTA File'),
      '#description' => t('Please enter the full system path for the FASTA file, or a path within the Drupal
        installation (e.g. /sites/default/files/xyz.obo). The path must be accessible to the
        server on which this Drupal instance is running.'),
      '#required' => TRUE,
    );

    // Get the list of organisms.
    $sql = "SELECT * FROM {organism} ORDER BY genus, species";
    $org_rset = chado_query($sql);
    $organisms = array();
    $organisms[''] = '';
    while ($organism = $org_rset->fetchObject()) {
      $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
    }
    $form['organism_id'] = array(
      '#title' => t('Organism'),
      // Element types are machine names and must never be translated.
      '#type' => 'select',
      '#description' => t("Choose the organism to which these sequences are associated"),
      '#required' => TRUE,
      '#options' => $organisms,
    );

    // Get the sequence ontology CV ID. It is only needed for the autocomplete
    // path, so the field stays usable (without autocomplete) if the 'sequence'
    // vocabulary has not been loaded yet.
    $cv = chado_select_record('cv', array('cv_id'), array('name' => 'sequence'));
    $form['seqtype'] = array(
      '#type' => 'textfield',
      '#title' => t('Sequence Type'),
      '#required' => TRUE,
      '#description' => t('Please enter the Sequence Ontology (SO) term name that describes the sequences in the FASTA file (e.g. gene, mRNA, polypeptide, etc...)'),
    );
    if (!empty($cv)) {
      $cv_id = $cv[0]->cv_id;
      $form['seqtype']['#autocomplete_path'] = "admin/tripal/storage/chado/auto_name/cvterm/$cv_id";
    }

    // The option keys (0, 1, 2) are translated back into method names by the
    // validate and submit handlers.
    $form['method'] = array(
      '#type' => 'radios',
      '#title' => 'Method',
      '#required' => TRUE,
      '#options' => array(
        t('Insert only'),
        t('Update only'),
        t('Insert and update'),
      ),
      '#description' => t('Select how features in the FASTA file are handled.
        Select "Insert only" to insert the new features. If a feature already
        exists with the same name or unique name and type then it is skipped.
        Select "Update only" to only update featues that already exist in the
        database. Select "Insert and Update" to insert features that do
        not exist and upate those that do.'),
      '#default_value' => 2,
    );

    // The option keys (0, 1) are translated back into match type names by the
    // validate and submit handlers.
    $form['match_type'] = array(
      '#type' => 'radios',
      '#title' => 'Name Match Type',
      '#required' => TRUE,
      '#options' => array(
        t('Name'),
        t('Unique name'),
      ),
      '#description' => t('Used for "updates only" or "insert and update" methods. Not required if method type is "insert".
        Feature data is stored in Chado with both a human-readable
        name and a unique name. If the features in your FASTA file are uniquely identified using
        a human-readable name then select the "Name" button. If your features are
        uniquely identified using the unique name then select the "Unique name" button. If you
        loaded your features first using the GFF loader then the unique name of each
        features were indicated by the "ID=" attribute and the name by the "Name=" attribute.
        By default, the FASTA loader will use the first word (character string
        before the first space) as the name for your feature. If
        this does not uniquely identify your feature consider specifying a regular expression in the advanced section below.
        Additionally, you may import both a name and a unique name for each sequence using the advanced options.'),
      '#default_value' => 1,
    );

    $form['analysis'] = array(
      '#type' => 'fieldset',
      '#title' => t('Analysis Used to Derive Features'),
      '#collapsed' => TRUE,
    );
    $form['analysis']['desc'] = array(
      '#markup' => t("Why specify an analysis for a data load? All data comes
        from some place, even if downloaded from Genbank. By specifying
        analysis details for all data uploads, it allows an end user to reproduce the
        data set, but at least indicates the source of the data."),
    );

    // Get the list of analyses.
    $sql = "SELECT * FROM {analysis} ORDER BY name";
    $analysis_rset = chado_query($sql);
    $analyses = array();
    $analyses[''] = '';
    while ($analysis = $analysis_rset->fetchObject()) {
      $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
    }
    $form['analysis']['analysis_id'] = array(
      '#title' => t('Analysis'),
      '#type' => 'select',
      '#description' => t("Choose the analysis to which these features are associated"),
      '#required' => TRUE,
      '#options' => $analyses,
    );

    // Advanced Options.
    $form['advanced'] = array(
      '#type' => 'fieldset',
      '#title' => t('Advanced Options'),
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    );
    // An 'item' element renders its '#markup' property; '#value' is ignored
    // and would leave the help text invisible.
    $form['advanced']['re_help'] = array(
      '#type' => 'item',
      '#markup' => t('A regular expression is an advanced method for extracting information from a string of text.
        Your FASTA file may contain both a human-readable name and a unique name for each sequence.
        If you want to import
        both the name and unique name for all sequences, then you must provide regular expressions
        so that the loader knows how to separate them.
        Otherwise the name and uniquename will be the same.
        By default, this loader will use the first word in the definition
        lines of the FASTA file
        as the name or unique name of the feature.'),
    );
    $form['advanced']['re_name'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature name from the FASTA definition line. For example, for a
        defintion line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the name would be, "^(.*?)\|.*$". All FASTA
        definition lines begin with the ">" symbol. You do not need to incldue
        this symbol in your regular expression.'),
    );
    $form['advanced']['re_uname'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the unique name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature name from the FASTA definition line. For example, for a
        defintion line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the unique name would be "^.*?\|(.*)$"). All FASTA
        definition lines begin with the ">" symbol. You do not need to incldue
        this symbol in your regular expression.'),
    );

    // Advanced database cross reference options.
    $form['advanced']['db'] = array(
      '#type' => 'fieldset',
      '#title' => t('External Database Reference'),
      '#weight' => 6,
      '#collapsed' => TRUE,
    );
    $form['advanced']['db']['re_accession'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the accession'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
      '#weight' => 2,
    );

    // Get the list of databases.
    $sql = "SELECT * FROM {db} ORDER BY name";
    $db_rset = chado_query($sql);
    $dbs = array();
    $dbs[''] = '';
    while ($db = $db_rset->fetchObject()) {
      $dbs[$db->db_id] = "$db->name";
    }
    $form['advanced']['db']['db_id'] = array(
      '#title' => t('External Database'),
      '#type' => 'select',
      '#description' => t("Plese choose an external database for which these sequences have a cross reference."),
      '#required' => FALSE,
      '#options' => $dbs,
      '#weight' => 1,
    );

    // Advanced relationship options.
    $form['advanced']['relationship'] = array(
      '#type' => 'fieldset',
      '#title' => t('Relationships'),
      '#weight' => 6,
      '#collapsed' => TRUE,
    );
    $rels = array();
    $rels[''] = '';
    $rels['part_of'] = 'part of';
    $rels['derives_from'] = 'produced by (derives from)';
    $form['advanced']['relationship']['rel_type'] = array(
      '#title' => t('Relationship Type'),
      '#type' => 'select',
      '#description' => t("Use this option to create associations, or relationships between the
        features of this FASTA file and existing features in the database. For
        example, to associate a FASTA file of peptides to existing genes or transcript sequence,
        select the type 'produced by'. For a CDS sequences select the type 'part of'"),
      '#required' => FALSE,
      '#options' => $rels,
      '#weight' => 5,
    );
    $form['advanced']['relationship']['re_subject'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the parent'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique
        name needed to identify the existing sequence for which the
        relationship type selected above will apply.'),
      '#weight' => 6,
    );
    $form['advanced']['relationship']['parent_type'] = array(
      '#type' => 'textfield',
      '#title' => t('Parent Type'),
      '#required' => FALSE,
      '#description' => t('Please enter the Sequence Ontology term for the parent. For example
        if the FASTA file being loaded is a set of proteins that are
        products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
        this type must match the type for already loaded features.'),
      '#weight' => 7,
    );

    $form['button'] = array(
      '#type' => 'submit',
      '#value' => t('Import FASTA file'),
      '#weight' => 10,
    );
    return $form;
  }
  /**
   * Validates the FASTA loader job form.
   *
   * Checks that:
   * - a regular expression is present for the name kind used for matching when
   *   only the other regular expression was supplied,
   * - the FASTA file exists, either relative to the Drupal installation or as
   *   an absolute path,
   * - the relationship fields and the database cross reference fields are
   *   either all provided or all left empty,
   * - the sequence type (and the parent type when a relationship is requested)
   *   is a term or synonym of the 'sequence' vocabulary.
   *
   * On completion the resolved file path is stored in
   * $form_state['storage']['dfile'] for the submit handler.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state array; receives the resolved file path.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_validate($form, &$form_state) {
    $values = $form_state['values'];
    $fasta_file = trim($values['fasta_file']);
    $type = trim($values['seqtype']);
    $re_name = trim($values['re_name']);
    $re_uname = trim($values['re_uname']);
    $re_accession = trim($values['re_accession']);
    $db_id = $values['db_id'];
    $rel_type = $values['rel_type'];
    $re_subject = trim($values['re_subject']);
    $parent_type = trim($values['parent_type']);

    // The radios submit the option index; translate it to the match type name.
    $match_types = array('Name', 'Unique name');
    $match_type = trim($values['match_type']);
    if (isset($match_types[$match_type])) {
      $match_type = $match_types[$match_type];
    }

    if ($re_name and !$re_uname and strcmp($match_type, 'Unique name') == 0) {
      form_set_error('re_uname', t("You must provide a regular expression to identify the sequence unique name"));
    }
    if (!$re_name and $re_uname and strcmp($match_type, 'Name') == 0) {
      form_set_error('re_name', t("You must provide a regular expression to identify the sequence name"));
    }

    // Check to see if the file is located local to Drupal.
    $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
    if (!file_exists($dfile)) {
      // If not local to Drupal, the file must be someplace else, just use
      // the full path provided.
      $dfile = $fasta_file;
    }
    if (!file_exists($dfile)) {
      form_set_error('fasta_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
    }

    // Make sure if a relationship is specified that all fields are provided.
    if (($rel_type or $parent_type) and !$re_subject) {
      form_set_error('re_subject', t("Please provide a regular expression for the parent"));
    }
    if (($rel_type or $re_subject) and !$parent_type) {
      form_set_error('parent_type', t("Please provide a SO term for the parent"));
    }
    if (($parent_type or $re_subject) and !$rel_type) {
      form_set_error('rel_type', t("Please select a relationship type"));
    }

    // Make sure if a database is specified that all fields are provided.
    if ($db_id and !$re_accession) {
      form_set_error('re_accession', t("Please provide a regular expression for the accession"));
    }
    if ($re_accession and !$db_id) {
      form_set_error('db_id', t("Please select a database"));
    }

    // Check to make sure the types exist, matching on either the term name or
    // one of its synonyms.
    $cvtermsql = "SELECT CVT.cvterm_id
      FROM {cvterm} CVT
      INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
      LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
      WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)";
    $cvterm = chado_query($cvtermsql, array(
      ':cvname' => 'sequence',
      ':name' => $type,
      ':synonym' => $type,
    ))->fetchObject();
    if (!$cvterm) {
      // The error must be set on 'seqtype', the actual name of the form
      // element, otherwise the field is not highlighted.
      form_set_error('seqtype', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
    }
    // An empty parent type has already been reported above, so only look the
    // term up when one was entered.
    if ($rel_type and $parent_type) {
      $cvterm = chado_query($cvtermsql, array(
        ':cvname' => 'sequence',
        ':name' => $parent_type,
        ':synonym' => $parent_type,
      ))->fetchObject();
      if (!$cvterm) {
        form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
      }
    }

    // Hand the resolved path over to the submit handler.
    $form_state['storage']['dfile'] = $dfile;
  }
  /**
   * Submit handler of the FASTA loader form: queues the loading job.
   *
   * Reads the file path resolved by the validate handler and the submitted
   * values, converts the numeric radio selections into their method and match
   * type names, and registers a Tripal job that calls
   * tripal_feature_load_fasta() with those arguments.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state array.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_submit($form, &$form_state) {
    global $user;

    $submitted = $form_state['values'];
    $dfile = $form_state['storage']['dfile'];

    // Translate the selected radio index into the method name.
    $method = trim($submitted['method']);
    switch ($method) {
      case 0:
        $method = 'Insert only';
        break;

      case 1:
        $method = 'Update only';
        break;

      case 2:
        $method = 'Insert and update';
        break;
    }

    // Translate the selected radio index into the match type name.
    $match_type = trim($submitted['match_type']);
    switch ($match_type) {
      case 0:
        $match_type = 'Name';
        break;

      case 1:
        $match_type = 'Unique name';
        break;
    }

    // The order of the job arguments follows the parameter order of
    // tripal_feature_load_fasta().
    $job_args = array(
      $dfile,
      $submitted['organism_id'],
      trim($submitted['seqtype']),
      trim($submitted['re_name']),
      trim($submitted['re_uname']),
      trim($submitted['re_accession']),
      $submitted['db_id'],
      $submitted['rel_type'],
      trim($submitted['re_subject']),
      trim($submitted['parent_type']),
      $method,
      $user->uid,
      $submitted['analysis_id'],
      $match_type,
    );

    // Only the last path component is shown in the job name.
    $file_name = preg_replace("/.*\/(.*)/", "$1", $dfile);

    $includes = array();
    $includes[] = module_load_include('inc', 'tripal_chado', 'includes/loaders/tripal_chado.fasta_loader');

    tripal_add_job("Import FASTA file: $file_name", 'tripal_chado', 'tripal_feature_load_fasta', $job_args, $user->uid, 10, $includes);
  }
  /**
   * Actually load a fasta file.
   *
   * This is the function called by tripal jobs. The file is processed in two
   * steps: first every definition line is parsed and the byte range of each
   * sequence in the file is recorded, then each sequence is handed to
   * tripal_feature_load_fasta_feature(). All work happens inside a single
   * database transaction that is rolled back when an exception is thrown.
   *
   * @param $dfile
   *   The full path to the fasta file to load.
   * @param $organism_id
   *   The organism_id of the organism these features are from.
   * @param $type
   *   The type of features contained in the fasta file.
   * @param $re_name
   *   A regular expression (without delimiters) to extract the feature.name
   *   from the fasta header.
   * @param $re_uname
   *   A regular expression (without delimiters) to extract the
   *   feature.uniquename from the fasta header.
   * @param $re_accession
   *   A regular expression (without delimiters) to extract the accession of
   *   the feature.dbxref_id.
   * @param $db_id
   *   The db_id of the above dbxref.
   * @param $rel_type
   *   The type of relationship when creating a feature_relationship between
   *   this feature (object) and an extracted subject.
   * @param $re_subject
   *   The regular expression (without delimiters) to extract the uniquename of
   *   the feature to be the subject of the above specified relationship.
   * @param $parent_type
   *   The type of the parent feature.
   * @param $method
   *   The method of feature adding. (ie: 'Insert only', 'Update only',
   *   'Insert and update').
   * @param $uid
   *   The user id of the user who submitted the job.
   * @param $analysis_id
   *   The analysis_id to associate the features in this fasta file with.
   * @param $match_type
   *   Whether to match existing features based on the 'Name' or 'Unique name'.
   * @param $job
   *   (optional) The tripal job. Progress is only reported when it is set.
   *
   * @return
   *   0 when a required term cannot be found, the file cannot be opened or the
   *   file contains no sequences; otherwise nothing.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession,
    $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type,
    $job = NULL) {

    $transaction = db_transaction();
    print "\nNOTE: Loading of this FASTA file is performed using a database transaction. \n" .
      "If the load fails or is terminated prematurely then the entire set of \n" .
      "insertions/updates is rolled back and will not be found in the database\n\n";

    // Defined up front so the catch block can tell whether the file was opened.
    $fh = NULL;
    try {
      // First get the type for this sequence.
      $cvtermsql = "
        SELECT CVT.cvterm_id
        FROM {cvterm} CVT
        INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
        LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
        WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)
      ";
      $cvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $type, ':synonym' => $type))->fetchObject();
      if (!$cvterm) {
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR,
          "Cannot find the term type: '%type'", array('%type' => $type));
        return 0;
      }

      // Second, if there is a parent type then get that.
      $parentcvterm = NULL;
      if ($parent_type) {
        $parentcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
        if (!$parentcvterm) {
          // Report the requested name; the lookup result is empty here.
          tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the paretne term type: '%type'", array(
            '%type' => $parent_type,
          ));
          return 0;
        }
      }

      // Third, if there is a relationship type then get that.
      $relcvterm = NULL;
      if ($rel_type) {
        $relcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $rel_type, ':synonym' => $rel_type))->fetchObject();
        if (!$relcvterm) {
          tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the relationship term type: '%type'", array(
            '%type' => $rel_type,
          ));
          return 0;
        }
      }

      // We need to get the table schema to make sure we don't overrun the
      // size of fields with what our regular expressions retrieve.
      $feature_tbl = chado_get_schema('feature');
      $dbxref_tbl = chado_get_schema('dbxref');
      $max_name_len = $feature_tbl['fields']['name']['length'];
      $max_accession_len = $dbxref_tbl['fields']['accession']['length'];

      print "Step 1: finding sequences\n";
      $filesize = filesize($dfile);
      $fh = fopen($dfile, 'r');
      if (!$fh) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "cannot open file: %dfile", array(
          '%dfile' => $dfile,
        ));
        return 0;
      }

      // Calculate the interval (in bytes) at which we will print the status.
      $interval = max(1, intval($filesize * 0.01));
      $intv_read = 0;
      $num_read = 0;
      $line_num = 0;

      // Iterate through the lines of the file. Keep a record for
      // where in the file each line is at for later import.
      $seqs = array();
      $num_seqs = 0;
      $prev_pos = 0;
      // Compare against FALSE explicitly so a line consisting of "0" does not
      // end the loop early.
      while (($line = fgets($fh)) !== FALSE) {
        $line_num++;
        $num_read += strlen($line);
        $intv_read += strlen($line);

        // If we encounter a definition line then get the name, uniquename,
        // accession and relationship subject from the definition line.
        if (preg_match('/^>/', $line)) {
          // Remove the > symbol from the defline.
          $defline = preg_replace("/^>/", '', $line);
          $line_vars = array('%line' => $line_num);

          // Start from empty values so a failed match never reuses the values
          // of the previous sequence.
          $name = '';
          $uname = '';
          $accession = '';
          $subject = '';

          // Get the feature name. If no regular expression was provided and
          // the match_type is name then use the first word as the name,
          // otherwise we don't set the name.
          if ($re_name) {
            $found = _tripal_feature_load_fasta_first_capture($re_name, $defline);
            if ($found === NULL) {
              tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature name finds nothing. Line %line.", $line_vars);
            }
            elseif (strlen($found) > $max_name_len) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", $line_vars);
            }
            else {
              $name = $found;
            }
          }
          elseif (strcmp($match_type, 'Name') == 0) {
            $found = _tripal_feature_load_fasta_first_capture('^\s*(.*?)[\s\|].*$', $defline);
            if ($found === NULL) {
              tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature name. Line %line.", $line_vars);
            }
            elseif (strlen($found) > $max_name_len) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", $line_vars);
            }
            else {
              $name = $found;
            }
          }

          // Get the feature uniquename. If no regular expression was provided
          // and the match_type is unique name then use the first word,
          // otherwise we don't set the uniquename.
          if ($re_uname) {
            $found = _tripal_feature_load_fasta_first_capture($re_uname, $defline);
            if ($found === NULL) {
              tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", $line_vars);
            }
            else {
              $uname = $found;
            }
          }
          elseif (strcmp($match_type, 'Unique name') == 0) {
            $found = _tripal_feature_load_fasta_first_capture('^\s*(.*?)[\s\|].*$', $defline);
            if ($found === NULL) {
              tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature unique name. Line %line.", $line_vars);
            }
            else {
              $uname = $found;
            }
          }

          // Get the accession, but only if a regular expression is provided.
          if ($re_accession) {
            $found = _tripal_feature_load_fasta_first_capture($re_accession, $defline);
            if ($found !== NULL) {
              if (strlen($found) > $max_accession_len) {
                tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves an accession too long for the feature name. " .
                  "Cannot add cross reference. Line %line.", $line_vars);
              }
              else {
                $accession = $found;
              }
            }
          }

          // Get the relationship subject. This expression has always been
          // applied to the full line, including the leading ">", so that is
          // kept for existing expressions.
          if ($re_subject) {
            $found = _tripal_feature_load_fasta_first_capture($re_subject, $line);
            if ($found !== NULL) {
              $subject = $found;
            }
          }

          // Add the details to the sequence. The residues start right after
          // the definition line.
          $seqs[$num_seqs] = array(
            'name' => $name,
            'uname' => $uname,
            'accession' => $accession,
            'subject' => $subject,
            'seq_start' => ftell($fh),
          );

          // If this isn't the first sequence, then we want to specify where
          // the previous sequence ended.
          if ($num_seqs > 0) {
            $seqs[$num_seqs - 1]['seq_end'] = $prev_pos;
          }
          $num_seqs++;
        }

        // Keep the current file position so we can use it to set the sequence
        // ending position.
        $prev_pos = ftell($fh);

        // Update the job status every $interval bytes.
        if ($job and $intv_read >= $interval) {
          $intv_read = 0;
          $done = $filesize > 0 ? ($num_read / $filesize) * 100 : 100;
          print "Parsing Line $line_num (" . sprintf("%.2f", $done) . "%). Memory: " . number_format(memory_get_usage()) .
            " bytes.\r";
          tripal_set_job_progress($job, intval($done));
        }
      }

      // Guard against an empty file, for which the size is 0.
      $done = $filesize > 0 ? ($num_read / $filesize) * 100 : 100;
      print "Parsing Line $line_num (" . sprintf("%.2f", $done) . "%). Memory: " . number_format(memory_get_usage()) .
        " bytes.\r";
      if ($job) {
        tripal_set_job_progress($job, 50);
      }

      // Without any definition line there is nothing to import.
      if ($num_seqs == 0) {
        tripal_report_error('T_fasta_loader', TRIPAL_WARNING, "No sequences were found in the file: %dfile", array(
          '%dfile' => $dfile,
        ));
        fclose($fh);
        return 0;
      }

      // Set the end position for the last sequence: the end of the last line
      // that was read.
      $seqs[$num_seqs - 1]['seq_end'] = $prev_pos;

      // Now that we know where the sequences are in the file we need to add them.
      print "\nStep 2: Importing sequences\n";
      // The helper accepts a source argument; this loader does not supply one.
      $source = NULL;
      for ($i = 0; $i < $num_seqs; $i++) {
        $seq = $seqs[$i];
        print "Importing " . ($i + 1) . " of $num_seqs. ";
        $label = $seq['name'] ? $seq['name'] : $seq['uname'];
        print "Current feature: " . $label . ".\n";
        tripal_feature_load_fasta_feature($fh, $seq['name'], $seq['uname'], $db_id, $seq['accession'], $seq['subject'], $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm, $source, $method, $re_name, $match_type, $parentcvterm, $relcvterm, $seq['seq_start'], $seq['seq_end']);
      }
      if ($job) {
        tripal_set_job_progress($job, 100);
      }
      fclose($fh);
    }
    catch (Exception $e) {
      // The exception may have been thrown before the file was opened.
      if (is_resource($fh)) {
        fclose($fh);
      }
      $transaction->rollback();
      // Make sure we start errors on a new line.
      print "\n";
      watchdog_exception('T_fasta_loader', $e);
      print "FAILED: Rolling back database changes...\n";
    }
    print "\nDone\n";
  }

  /**
   * Returns the trimmed first capture group of a regular expression match.
   *
   * Private helper for tripal_feature_load_fasta().
   *
   * @param $regex
   *   The regular expression without delimiters. It is wrapped in "/"
   *   delimiters as is, so a "/" inside the expression must be escaped by the
   *   caller.
   * @param $text
   *   The text to search.
   *
   * @return
   *   The trimmed text of the first capture group, or NULL when the expression
   *   does not match or has no first capture group.
   */
  function _tripal_feature_load_fasta_first_capture($regex, $text) {
    $matches = array();
    if (preg_match("/$regex/", $text, $matches) and isset($matches[1])) {
      return trim($matches[1]);
    }
    return NULL;
  }
  565. /**
  566. * A helper function for tripal_feature_load_fasta() to load a single feature
  567. *
  568. * @ingroup fasta_loader
  569. */
  function tripal_feature_load_fasta_feature($fh, $name, $uname, $db_id, $accession, $parent,
    $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm, $source, $method, $re_name,
    $match_type, $parentcvterm, $relcvterm, $seq_start, $seq_end) {

    // Loads one FASTA record into Chado:
    //   1. looks up an existing feature by name or uniquename ($match_type is
    //      'Name' or 'Unique name'),
    //   2. inserts and/or updates the feature record according to $method
    //      ('Insert only', 'Update only' or 'Insert and update'),
    //   3. loads the residues found between the byte offsets $seq_start and
    //      $seq_end of the open file handle $fh,
    //   4. optionally links the feature to an analysis ($analysis_id), a
    //      database accession ($db_id / $accession) and a parent feature
    //      ($rel_type / $parent / $parentcvterm / $relcvterm).
    //
    // Returns 0 when the record is skipped or any step fails. Nothing is
    // returned explicitly on success. $source and $re_name are accepted for
    // interface compatibility but are not used here.

    // The matched (or newly inserted) feature record, if any.
    $feature = NULL;

    // Human readable type used in messages. Fall back to the id in case the
    // cvterm object does not carry a name.
    $type = isset($cvterm->name) ? $cvterm->name : $cvterm->cvterm_id;

    // Check to see if this feature already exists if the match_type is 'Name'.
    if (strcmp($match_type, 'Name') == 0) {
      $values = array(
        'organism_id' => $organism_id,
        'name' => $name,
        'type_id' => $cvterm->cvterm_id,
      );
      $results = chado_select_record('feature', array('feature_id'), $values);
      if (count($results) > 1) {
        tripal_report_error('T_fasta_loader', TRIPAL_WARNING, "Multiple features exist with the name '%name' of type '%type' for the organism. skipping", array(
          '%name' => $name,
          '%type' => $type,
        ));
        return 0;
      }
      if (count($results) == 1) {
        $feature = $results[0];
      }
    }

    // Check if this feature already exists if the match_type is 'Unique name'.
    if (strcmp($match_type, 'Unique name') == 0) {
      $values = array(
        'organism_id' => $organism_id,
        'uniquename' => $uname,
        'type_id' => $cvterm->cvterm_id,
      );
      $results = chado_select_record('feature', array('feature_id'), $values);
      if (count($results) > 1) {
        tripal_report_error('T_fasta_loader', TRIPAL_WARNING, "Multiple features exist with the unique name '%uname' of type '%type' for the organism. skipping", array(
          '%uname' => $uname,
          '%type' => $type,
        ));
        return 0;
      }
      if (count($results) == 1) {
        $feature = $results[0];
      }
    }

    // If the feature exists but this is an "insert only" then skip. This
    // applies to both match types; otherwise an existing feature matched by
    // name would have its residues overwritten by an insert-only load.
    if ($feature && strcmp($method, 'Insert only') == 0) {
      tripal_report_error('T_fasta_loader', TRIPAL_WARNING, "Feature already exists '%name' ('%uname') while matching on %type. Skipping insert.", array(
        '%name' => $name,
        '%uname' => $uname,
        '%type' => drupal_strtolower($match_type),
      ));
      return 0;
    }

    // If we don't have a feature and we're doing an insert then do the insert.
    $inserted = 0;
    if (!$feature && (strcmp($method, 'Insert only') == 0 || strcmp($method, 'Insert and update') == 0)) {
      // If only one of the name and the unique name was provided then set them
      // to be the same.
      if (!$uname) {
        $uname = $name;
      }
      elseif (!$name) {
        $name = $uname;
      }

      // Insert the feature record.
      $values = array(
        'organism_id' => $organism_id,
        'name' => $name,
        'uniquename' => $uname,
        'type_id' => $cvterm->cvterm_id,
      );
      $success = chado_insert_record('feature', $values);
      if (!$success) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to insert feature '%name (%uname)'", array(
          '%name' => $name,
          '%uname' => $uname,
        ));
        return 0;
      }

      // Now get the feature we just inserted.
      $values = array(
        'organism_id' => $organism_id,
        'uniquename' => $uname,
        'type_id' => $cvterm->cvterm_id,
      );
      $results = chado_select_record('feature', array('feature_id'), $values);
      if (count($results) == 1) {
        $inserted = 1;
        $feature = $results[0];
      }
      else {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to retrieve newly inserted feature '%name (%uname)'", array(
          '%name' => $name,
          '%uname' => $uname,
        ));
        return 0;
      }
      // The residues are loaded once, further below, for both the insert and
      // the update paths.
    }

    // Without a feature nothing below can work. This happens when the user
    // asked for an update of a feature that does not exist, or when $method is
    // not one of the supported values.
    if (!$feature) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to find feature '%name' ('%uname') while matching on " .
        drupal_strtolower($match_type), array(
        '%name' => $name,
        '%uname' => $uname,
      ));
      return 0;
    }

    // If the feature already existed and this is an update then proceed with
    // the update.
    if (!$inserted && (strcmp($method, 'Update only') == 0 || strcmp($method, 'Insert and update') == 0)) {

      // If the user wants to match on the Name field. When no unique name was
      // supplied the uniquename is left untouched.
      if (strcmp($match_type, 'Name') == 0 && $uname) {
        // First check to make sure that by changing the unique name of this
        // feature we won't conflict with another existing feature of the same
        // uniquename and type.
        $values = array(
          'organism_id' => $organism_id,
          'uniquename' => $uname,
          'type_id' => $cvterm->cvterm_id,
        );
        $results = chado_select_record('feature', array('feature_id'), $values);
        foreach ($results as $existing) {
          // The feature being updated may already carry this uniquename; that
          // is not a conflict.
          if ($existing->feature_id != $feature->feature_id) {
            tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Cannot update the feature '%name' with a uniquename of '%uname' and type of '%type' as it conflicts with an existing feature with the same uniquename and type.", array(
              '%name' => $name,
              '%uname' => $uname,
              '%type' => $type,
            ));
            return 0;
          }
        }

        // The changes to the uniquename don't conflict so proceed with the
        // update.
        $values = array('uniquename' => $uname);
        $match = array(
          'name' => $name,
          'organism_id' => $organism_id,
          'type_id' => $cvterm->cvterm_id,
        );
        $success = chado_update_record('feature', $match, $values);
        if (!$success) {
          tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to update feature '%name' ('%uname')", array(
            '%name' => $name,
            '%uname' => $uname,
          ));
          return 0;
        }
      }

      // If the user wants to match on the unique name field and we have a new
      // name then update the name.
      if (strcmp($match_type, 'Unique name') == 0 && $name) {
        $values = array('name' => $name);
        $match = array(
          'uniquename' => $uname,
          'organism_id' => $organism_id,
          'type_id' => $cvterm->cvterm_id,
        );
        $success = chado_update_record('feature', $match, $values);
        if (!$success) {
          tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to update feature '%name' ('%uname')", array(
            '%name' => $name,
            '%uname' => $uname,
          ));
          return 0;
        }
      }
    }

    // Load (or replace) the residues for this feature. The loader returns
    // FALSE when a database write fails.
    if (tripal_feature_load_fasta_residues($fh, $feature->feature_id, $seq_start, $seq_end) === FALSE) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to load the residues for feature '%name' ('%uname')", array(
        '%name' => $name,
        '%uname' => $uname,
      ));
      return 0;
    }

    // Add in the analysis link.
    if ($analysis_id) {
      // If the association doesn't already exist then add one.
      $values = array(
        'analysis_id' => $analysis_id,
        'feature_id' => $feature->feature_id,
      );
      $results = chado_select_record('analysisfeature', array('analysisfeature_id'), $values);
      if (count($results) == 0) {
        $success = chado_insert_record('analysisfeature', $values);
        if (!$success) {
          tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to associate analysis and feature '%name' ('%uname')", array(
            '%name' => $name,
            '%uname' => $uname,
          ));
          return 0;
        }
      }
    }

    // Now add the database cross reference.
    if ($db_id) {
      // Check to see if this accession reference exists, if not add it.
      $values = array(
        'db_id' => $db_id,
        'accession' => $accession,
      );
      $results = chado_select_record('dbxref', array('dbxref_id'), $values);
      if (count($results) == 0) {
        $success = chado_insert_record('dbxref', $values);
        if (!$success) {
          tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to add database accession '%accession'", array(
            '%accession' => $accession,
          ));
          return 0;
        }
        $results = chado_select_record('dbxref', array('dbxref_id'), $values);
        if (count($results) == 1) {
          $dbxref = $results[0];
        }
        else {
          tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to retrieve newly inserted dbxref '%name (%uname)'", array(
            '%name' => $name,
            '%uname' => $uname,
          ));
          return 0;
        }
      }
      else {
        $dbxref = $results[0];
      }

      // Check to see if the feature dbxref record exists, if not then add it.
      $values = array(
        'feature_id' => $feature->feature_id,
        'dbxref_id' => $dbxref->dbxref_id,
      );
      $results = chado_select_record('feature_dbxref', array('feature_dbxref_id'), $values);
      if (count($results) == 0) {
        $success = chado_insert_record('feature_dbxref', $values);
        if (!$success) {
          tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to add associate database accession '%accession' with feature", array(
            '%accession' => $accession,
          ));
          return 0;
        }
      }
    }

    // Now add in the relationship to the parent feature if one was requested.
    if ($rel_type) {
      $values = array(
        'organism_id' => $organism_id,
        'uniquename' => $parent,
        'type_id' => $parentcvterm->cvterm_id,
      );
      $results = chado_select_record('feature', array('feature_id'), $values);
      if (count($results) != 1) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Cannot find a unique feature for the parent '%parent' of type '%type' for the feature.", array(
          '%parent' => $parent,
          '%type' => $parent_type,
        ));
        return 0;
      }
      $parent_feature = $results[0];

      // Check to see if the relationship already exists, if not then add it.
      $values = array(
        'subject_id' => $feature->feature_id,
        'object_id' => $parent_feature->feature_id,
        'type_id' => $relcvterm->cvterm_id,
      );
      $results = chado_select_record('feature_relationship', array('feature_relationship_id'), $values);
      if (count($results) == 0) {
        $success = chado_insert_record('feature_relationship', $values);
        if (!$success) {
          tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to add the relationship between feature '%name' and its parent '%parent'", array(
            '%name' => $name,
            '%parent' => $parent,
          ));
          return 0;
        }
      }
    }
  }
  /**
   * Loads the residues of one FASTA record into the feature table.
   *
   * This function seeks to the proper location in the file for the sequence,
   * reads it line by line and appends the residues to the feature.residues
   * column in chunks, so that a very large sequence never has to be sent to the
   * database in a single statement. Afterwards the seqlen and md5checksum
   * columns are recalculated from the stored residues. Progress is printed to
   * standard output.
   *
   * @param resource $fh
   *   An open, seekable file handle for the FASTA file.
   * @param int $feature_id
   *   The feature_id of the feature record that receives the residues.
   * @param int $seq_start
   *   The byte offset in the file of the first line of the sequence.
   * @param int $seq_end
   *   The byte offset in the file just past the last line of the sequence.
   *   Reading stops as soon as the file position reaches or passes it.
   *
   * @return bool
   *   TRUE on success, FALSE if a database statement failed.
   */
  function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_end) {
    // First position the file at the beginning of the sequence.
    fseek($fh, $seq_start, SEEK_SET);

    // The number of bytes read from the file before a chunk is flushed.
    $chunk_size = 100000000;
    $chunk = '';

    // The number of bytes the sequence occupies in the file. Never let it drop
    // below one because it is used as a divisor for the progress report.
    $seqlen = max(1, $seq_end - $seq_start);

    // Calculate the interval at which we update the percent complete. We
    // don't want to repeat the update too often or it slows things down, so
    // the interval is one percent of the sequence but at least 100000 bytes.
    $interval = max(100000, intval($seqlen * 0.01));

    $chunk_intv_read = 0;
    $intv_read = 0;
    $num_read = 0;

    $append_sql = "
      UPDATE {feature}
      SET residues = residues || :chunk
      WHERE feature_id = :feature_id
    ";

    // First, make sure we don't have a null in the residues, otherwise the
    // concatenation above would yield null as well.
    $sql = "UPDATE {feature} SET residues = '' WHERE feature_id = :feature_id";
    if (!chado_query($sql, array(':feature_id' => $feature_id))) {
      return FALSE;
    }

    // Read in the lines until we reach the end of the sequence. Once enough
    // bytes have been read append the sequence to the one in the database.
    print "Sequence complete: 0%. Memory: " . number_format(memory_get_usage()) . " bytes. \r";

    // Compare against FALSE explicitly: a final line consisting of "0" is
    // falsy but is still valid input.
    while (($line = fgets($fh)) !== FALSE) {
      // fgets() keeps the line break, so strlen() is the exact byte count.
      $line_length = strlen($line);
      $num_read += $line_length;
      $chunk_intv_read += $line_length;
      $intv_read += $line_length;
      $chunk .= trim($line);

      // If we've read in enough of the sequence then append it to the database.
      if ($chunk_intv_read >= $chunk_size) {
        $success = chado_query($append_sql, array(':feature_id' => $feature_id, ':chunk' => $chunk));
        if (!$success) {
          return FALSE;
        }
        $chunk = '';
        $chunk_intv_read = 0;
      }

      if ($intv_read >= $interval) {
        $percent = sprintf("%.2f", min(100, ($num_read / $seqlen) * 100));
        print "Sequence complete: " . $percent . "%. Memory: " . number_format(memory_get_usage()) .
          " bytes. \r";
        $intv_read = 0;
      }

      // If we've reached the end of the sequence then break out of the loop.
      // Use >= so that an offset that does not fall exactly on a line boundary
      // cannot make us read into the following records.
      if (ftell($fh) >= $seq_end) {
        break;
      }
    }

    // Write the last bit of sequence if it remains.
    if (strlen($chunk) > 0) {
      $success = chado_query($append_sql, array(':feature_id' => $feature_id, ':chunk' => $chunk));
      if (!$success) {
        return FALSE;
      }
      $chunk = '';
    }

    // Now update the seqlen and md5checksum fields.
    $sql = "UPDATE {feature} SET seqlen = char_length(residues), md5checksum = md5(residues) WHERE feature_id = :feature_id";
    if (!chado_query($sql, array(':feature_id' => $feature_id))) {
      return FALSE;
    }

    $percent = sprintf("%.2f", min(100, ($num_read / $seqlen) * 100));
    print "Sequence complete: " . $percent . "%. Memory: " . number_format(memory_get_usage()) .
      " bytes. \r";

    return TRUE;
  }