FASTAImporter.inc 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026
  1. <?php
  2. class FASTAImporter extends TripalImporter {
  /**
   * The name of this loader. This name will be presented to the site
   * user.
   */
  public static $name = 'Chado FASTA Loader';

  /**
   * The machine name for this loader. This name will be used to construct
   * the URL for the loader.
   */
  public static $machine_name = 'chado_fasta_loader';

  /**
   * A brief description for this loader. This description will be
   * presented to the site user.
   */
  public static $description = 'Load sequences from a multi-FASTA file into Chado';

  /**
   * An array containing the extensions of allowed file types.
   */
  public static $file_types = [
    'fasta',
    'txt',
    'fa',
    'aa',
    'pep',
    'nuc',
    'faa',
    'fna',
  ];

  /**
   * Provides information to the user about the file upload. Typically this
   * may include a description of the file types allowed.
   *
   * The extensions named here mirror the entries of $file_types; keep the two
   * in sync when adding or removing an extension.
   */
  public static $upload_description = 'Please provide the FASTA file. The file must have one of the following extensions: .fasta, .txt, .fa, .aa, .pep, .nuc, .faa or .fna.';

  /**
   * The title that should appear above the file upload section.
   */
  public static $upload_title = 'FASTA Upload';

  /**
   * Text that should appear on the button at the bottom of the importer
   * form.
   */
  public static $button_text = 'Import FASTA file';

  /**
   * Indicates the methods that the file uploader will support.
   */
  public static $methods = [
    // Allow the user to upload a file to the server.
    'file_upload' => TRUE,
    // Allow the user to provide the path on the Tripal server for the file.
    'file_local' => TRUE,
    // Allow the user to provide a remote URL for the file.
    'file_remote' => TRUE,
  ];
  /**
   * Builds the importer-specific elements of the FASTA loader form.
   *
   * Adds the organism selector, the sequence type field, the insert/update
   * method, the name matching mode, and an "Additional Options" fieldset that
   * holds the name / unique name regular expressions, the external database
   * cross reference settings and the parent relationship settings.
   *
   * @param array $form
   *   The form array to which the elements are added.
   * @param array $form_state
   *   The form state, passed by reference. It is not read by this method.
   *
   * @return array
   *   The form array with the FASTA loader elements added.
   *
   * @see TripalImporter::form()
   */
  public function form($form, &$form_state) {
    // Get the list of organisms, keyed by organism_id. The empty first option
    // forces the user to make an explicit choice.
    $org_rset = chado_query("SELECT * FROM {organism} ORDER BY genus, species");
    $organisms = ['' => ''];
    while ($organism = $org_rset->fetchObject()) {
      $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
    }
    $form['organism_id'] = [
      '#title' => t('Organism'),
      // Element types are machine names and must never be translated.
      '#type' => 'select',
      '#description' => t("Choose the organism to which these sequences are associated"),
      '#required' => TRUE,
      '#options' => $organisms,
    ];

    $form['seqtype'] = [
      '#type' => 'textfield',
      '#title' => t('Sequence Type'),
      '#required' => TRUE,
      '#description' => t('Please enter the Sequence Ontology (SO) term name that describes the sequences in the FASTA file (e.g. gene, mRNA, polypeptide, etc...)'),
    ];
    // The autocomplete path needs the ID of the sequence ontology CV. Only
    // attach it when that CV is present so a missing vocabulary does not
    // break the form; the field still works as a plain text field.
    $cv = chado_select_record('cv', ['cv_id'], ['name' => 'sequence']);
    if (!empty($cv)) {
      $cv_id = $cv[0]->cv_id;
      $form['seqtype']['#autocomplete_path'] = "admin/tripal/storage/chado/auto_name/cvterm/$cv_id";
    }

    // The option index (0, 1 or 2) is what gets submitted and is what
    // formValidate() and run() map back to a label.
    $form['method'] = [
      '#type' => 'radios',
      '#title' => t('Method'),
      '#required' => TRUE,
      '#options' => [
        t('Insert only'),
        t('Update only'),
        t('Insert and update'),
      ],
      '#description' => t('Select how features in the FASTA file are handled.
        Select "Insert only" to insert the new features. If a feature already
        exists with the same name or unique name and type then it is skipped.
        Select "Update only" to only update features that already exist in the
        database. Select "Insert and Update" to insert features that do
        not exist and update those that do.'),
      '#default_value' => 2,
    ];

    // The option index (0 or 1) is what gets submitted.
    $form['match_type'] = [
      '#type' => 'radios',
      '#title' => t('Name Match Type'),
      '#required' => TRUE,
      '#options' => [
        t('Name'),
        t('Unique name'),
      ],
      '#description' => t('Used for "updates only" or "insert and update" methods. Not required if method type is "insert".
        Feature data is stored in Chado with both a human-readable
        name and a unique name. If the features in your FASTA file are uniquely identified using
        a human-readable name then select the "Name" button. If your features are
        uniquely identified using the unique name then select the "Unique name" button. If you
        loaded your features first using the GFF loader then the unique name of each
        features were indicated by the "ID=" attribute and the name by the "Name=" attribute.
        By default, the FASTA loader will use the first word (character string
        before the first space) as the name for your feature. If
        this does not uniquely identify your feature consider specifying a regular expression in the advanced section below.
        Additionally, you may import both a name and a unique name for each sequence using the advanced options.'),
      '#default_value' => 1,
    ];

    // Additional Options.
    $form['additional'] = [
      '#type' => 'fieldset',
      '#title' => t('Additional Options'),
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    ];
    $form['additional']['re_help'] = [
      '#type' => 'item',
      // Drupal 7 renders the body of an 'item' element from '#markup'; the
      // Drupal 6 style '#value' key is ignored and the text would be lost.
      '#markup' => t('A regular expression is an advanced method for extracting
        information from a string of text. Your FASTA file may contain both a
        human-readable name and a unique name for each sequence. If you want
        to import both the name and unique name for all sequences, then you
        must provide regular expressions so that the loader knows how to
        separate them. Otherwise the name and uniquename will be the same.
        By default, this loader will use the first word in the definition
        lines of the FASTA file
        as the name or unique name of the feature.'),
    ];
    $form['additional']['re_name'] = [
      '#type' => 'textfield',
      '#title' => t('Regular expression for the name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature name from the FASTA definition line. For example, for a
        definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the name would be, "^(.*?)\|.*$". All FASTA
        definition lines begin with the ">" symbol. You do not need to include
        this symbol in your regular expression.'),
    ];
    $form['additional']['re_uname'] = [
      '#type' => 'textfield',
      '#title' => t('Regular expression for the unique name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature unique name from the FASTA definition line. For example, for a
        definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the unique name would be "^.*?\|(.*)$". All FASTA
        definition lines begin with the ">" symbol. You do not need to include
        this symbol in your regular expression.'),
    ];

    // Advanced database cross reference options.
    $form['additional']['db'] = [
      '#type' => 'fieldset',
      '#title' => t('External Database Reference'),
      '#weight' => 6,
      '#collapsed' => TRUE,
    ];
    $form['additional']['db']['re_accession'] = [
      '#type' => 'textfield',
      '#title' => t('Regular expression for the accession'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
      '#weight' => 2,
    ];

    // Get the list of databases, keyed by db_id.
    $db_rset = chado_query("SELECT * FROM {db} ORDER BY name");
    $dbs = ['' => ''];
    while ($db = $db_rset->fetchObject()) {
      $dbs[$db->db_id] = "$db->name";
    }
    $form['additional']['db']['db_id'] = [
      '#title' => t('External Database'),
      '#type' => 'select',
      '#description' => t("Please choose an external database for which these sequences have a cross reference."),
      '#required' => FALSE,
      '#options' => $dbs,
      '#weight' => 1,
    ];

    // Advanced relationship options.
    $form['additional']['relationship'] = [
      '#type' => 'fieldset',
      '#title' => t('Relationships'),
      '#weight' => 6,
      '#collapsed' => TRUE,
    ];
    // The keys are the relationship term names handed to the loader.
    $rels = [
      '' => '',
      'part_of' => 'part of',
      'derives_from' => 'produced by (derives from)',
    ];
    $form['additional']['relationship']['rel_type'] = [
      '#title' => t('Relationship Type'),
      '#type' => 'select',
      '#description' => t("Use this option to create associations, or relationships between the
        features of this FASTA file and existing features in the database. For
        example, to associate a FASTA file of peptides to existing genes or transcript sequence,
        select the type 'produced by'. For a CDS sequences select the type 'part of'"),
      '#required' => FALSE,
      '#options' => $rels,
      '#weight' => 5,
    ];
    $form['additional']['relationship']['re_subject'] = [
      '#type' => 'textfield',
      '#title' => t('Regular expression for the parent'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique
        name needed to identify the existing sequence for which the
        relationship type selected above will apply. If no regular
        expression is provided, the parent unique name must be the
        same as the loaded feature name.'),
      '#weight' => 6,
    ];
    $form['additional']['relationship']['parent_type'] = [
      '#type' => 'textfield',
      '#title' => t('Parent Type'),
      '#required' => FALSE,
      '#description' => t('Please enter the Sequence Ontology term for the parent. For example
        if the FASTA file being loaded is a set of proteins that are
        products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
        this type must match the type for already loaded features.'),
      '#weight' => 7,
    ];

    return $form;
  }
  /**
   * Validates the values submitted through the FASTA loader form.
   *
   * Checks that the regular expressions required by the chosen name match
   * type are present, that the relationship and external database settings
   * are complete, that every supplied regular expression compiles, and that
   * the sequence type (and the parent type when a relationship is selected)
   * exists in the 'sequence' vocabulary. Problems are reported with
   * form_set_error() against the offending element.
   *
   * @param array $form
   *   The form array. It is not read by this method.
   * @param array $form_state
   *   The form state, passed by reference; only $form_state['values'] is read.
   *
   * @see TripalImporter::formValidate()
   */
  public function formValidate($form, &$form_state) {
    $values = $form_state['values'];
    $type = trim($values['seqtype']);
    $re_name = trim($values['re_name']);
    $re_uname = trim($values['re_uname']);
    $re_accession = trim($values['re_accession']);
    $db_id = $values['db_id'];
    $rel_type = $values['rel_type'];
    $re_subject = trim($values['re_subject']);
    $parent_type = trim($values['parent_type']);

    // The 'match_type' radios submit the option index: 0 means match on the
    // name and 1 means match on the unique name. trim() yields a string, so
    // strict comparison avoids PHP's loose string-to-number rules.
    $match_type = trim($values['match_type']);
    $match_on_name = ($match_type === '0');
    $match_on_uname = ($match_type === '1');

    // When only one of the two regular expressions is given, it must be the
    // one for the field that features are matched on.
    if ($re_name && !$re_uname && $match_on_uname) {
      form_set_error('re_uname', t("You must provide a regular expression to identify the sequence unique name"));
    }
    if (!$re_name && $re_uname && $match_on_name) {
      form_set_error('re_name', t("You must provide a regular expression to identify the sequence name"));
    }

    // Make sure if a relationship is specified that all fields are provided.
    if (($rel_type || $re_subject) && !$parent_type) {
      form_set_error('parent_type', t("Please provide a SO term for the parent"));
    }
    if (($parent_type || $re_subject) && !$rel_type) {
      form_set_error('rel_type', t("Please select a relationship type"));
    }

    // Make sure if a database is specified that all fields are provided.
    if ($db_id && !$re_accession) {
      form_set_error('re_accession', t("Please provide a regular expression for the accession"));
    }
    if ($re_accession && !$db_id) {
      form_set_error('db_id', t("Please select a database"));
    }

    // Check to make sure the regexps are valid. The patterns are wrapped in
    // '/' delimiters exactly as the loader does when it applies them.
    $regexps = [
      're_name' => [
        $re_name,
        t("please provide a valid regular expression for the feature name."),
      ],
      're_uname' => [
        $re_uname,
        t("please provide a valid regular expression for the feature unique name."),
      ],
      're_accession' => [
        $re_accession,
        t("please provide a valid regular expression for the external database accession."),
      ],
      're_subject' => [
        $re_subject,
        t("please provide a valid regular expression for the relationship parent."),
      ],
    ];
    foreach ($regexps as $element => $info) {
      list($pattern, $message) = $info;
      // preg_match() returns FALSE and raises a warning when the pattern does
      // not compile; the warning is suppressed because the failure is reported
      // as a form error instead. An empty string is used as the subject
      // because passing NULL is deprecated as of PHP 8.1.
      if ($pattern && @preg_match("/$pattern/", '') === FALSE) {
        form_set_error($element, $message);
      }
    }

    // Check to make sure the types exist, either by name or by synonym.
    $cvtermsql = "
      SELECT CVT.cvterm_id
      FROM {cvterm} CVT
        INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
        LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
      WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)
    ";
    $cvterm = chado_query($cvtermsql, [
      ':cvname' => 'sequence',
      ':name' => $type,
      ':synonym' => $type,
    ])->fetchObject();
    if (!$cvterm) {
      // The element is named 'seqtype'; using its real name lets Drupal
      // highlight the field that needs to be corrected.
      form_set_error('seqtype', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
    }

    if ($rel_type) {
      $cvterm = chado_query($cvtermsql, [
        ':cvname' => 'sequence',
        ':name' => $parent_type,
        ':synonym' => $parent_type,
      ])->fetchObject();
      if (!$cvterm) {
        form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
      }
    }
  }
  /**
   * Runs the import using the arguments stored on this importer.
   *
   * Reads the submitted settings from $this->arguments['run_args'] and the
   * path of the first file from $this->arguments['files'], converts the
   * numeric 'method' and 'match_type' radio values into the labels that
   * loadFasta() expects, and then loads the file.
   *
   * @see TripalImporter::run()
   */
  public function run() {
    $arguments = $this->arguments['run_args'];
    $file_path = $this->arguments['files'][0]['file_path'];

    $organism_id = $arguments['organism_id'];
    $type = $arguments['seqtype'];
    $re_name = $arguments['re_name'];
    $re_uname = $arguments['re_uname'];
    $re_accession = $arguments['re_accession'];
    $db_id = $arguments['db_id'];
    $rel_type = $arguments['rel_type'];
    $re_subject = $arguments['re_subject'];
    $parent_type = $arguments['parent_type'];
    $analysis_id = $arguments['analysis_id'];

    // The form submits the index of the selected radio button. Translate the
    // index into the label used by the loader. A missing value falls back to
    // index 0, as before. Only numeric values are translated: with a loose
    // "== 0" test PHP < 8 would treat any non-numeric string (such as an
    // already translated label) as 0 and silently turn it into the first
    // option.
    $method_labels = ['Insert only', 'Update only', 'Insert and update'];
    $method = isset($arguments['method']) ? $arguments['method'] : 0;
    if (is_numeric($method) && isset($method_labels[(int) $method])) {
      $method = $method_labels[(int) $method];
    }

    $match_labels = ['Name', 'Unique name'];
    $match_type = isset($arguments['match_type']) ? $arguments['match_type'] : 0;
    if (is_numeric($match_type) && isset($match_labels[(int) $match_type])) {
      $match_type = $match_labels[(int) $match_type];
    }

    $this->loadFasta($file_path, $organism_id, $type, $re_name, $re_uname, $re_accession,
      $db_id, $rel_type, $re_subject, $parent_type, $method, $analysis_id,
      $match_type);
  }
  /**
   * Load a fasta file.
   *
   * The file is processed in two passes. The first pass scans every line,
   * extracts the name, unique name, accession and relationship subject from
   * each definition line and records the byte offsets at which the residues
   * of each sequence start and end. The second pass hands each recorded
   * sequence to loadFastaFeature() together with the still open file handle.
   *
   * @param $file_path
   *   The full path to the fasta file to load.
   * @param $organism_id
   *   The organism_id of the organism these features are from.
   * @param $type
   *   The type of features contained in the fasta file.
   * @param $re_name
   *   The regular expression to extract the feature.name from the fasta header.
   *   The value of the first capture group is used.
   * @param $re_uname
   *   The regular expression to extract the feature.uniquename from the fasta
   *   header. The value of the first capture group is used.
   * @param $re_accession
   *   The regular expression to extract the accession of the feature.dbxref_id.
   *   The value of the first capture group is used.
   * @param $db_id
   *   The database ID of the above accession.
   * @param $rel_type
   *   The type of relationship when creating a feature_relationship between
   *   this feature (object) and an extracted subject.
   * @param $re_subject
   *   The regular expression to extract the uniquename of the feature to be
   *   the subject of the above specified relationship. Unlike the other
   *   expressions it is applied to the whole definition line, including the
   *   leading ">" symbol.
   * @param $parent_type
   *   The type of the parent feature.
   * @param $method
   *   The method of feature adding. (ie: 'Insert only', 'Update only',
   *   'Insert and update').
   * @param $analysis_id
   *   The analysis_id to associate the features in this fasta file with.
   * @param $match_type
   *   Whether to match existing features based on the 'Name' or 'Unique name'.
   *
   * @return int|null
   *   0 when the sequence, parent or relationship term cannot be found;
   *   otherwise nothing is returned.
   *
   * @throws Exception
   *   If the file cannot be opened. Exceptions raised while importing a
   *   feature are passed on after the file handle is closed.
   */
  private function loadFasta($file_path, $organism_id, $type, $re_name, $re_uname, $re_accession,
                             $db_id, $rel_type, $re_subject, $parent_type, $method, $analysis_id, $match_type) {
    // The term name is selected along with the ID because loadFastaFeature()
    // prints the type name in its messages.
    $cvtermsql = "
      SELECT CVT.cvterm_id, CVT.name
      FROM {cvterm} CVT
        INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
        LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
      WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)
    ";

    // First get the type for this sequence.
    $cvterm = chado_query($cvtermsql, [
      ':cvname' => 'sequence',
      ':name' => $type,
      ':synonym' => $type,
    ])->fetchObject();
    if (!$cvterm) {
      $this->logMessage("Cannot find the term type: '!type'", ['!type' => $type], TRIPAL_ERROR);
      return 0;
    }

    // Second, if there is a parent type then get that.
    $parentcvterm = NULL;
    if ($parent_type) {
      $parentcvterm = chado_query($cvtermsql, [
        ':cvname' => 'sequence',
        ':name' => $parent_type,
        ':synonym' => $parent_type,
      ])->fetchObject();
      if (!$parentcvterm) {
        // Report the name that was looked up, not the empty lookup result.
        $this->logMessage("Cannot find the parent term type: '!type'",
          ['!type' => $parent_type], TRIPAL_ERROR);
        return 0;
      }
    }

    // Third, if there is a relationship type then get that.
    $relcvterm = NULL;
    if ($rel_type) {
      $relcvterm = chado_query($cvtermsql, [
        ':cvname' => 'sequence',
        ':name' => $rel_type,
        ':synonym' => $rel_type,
      ])->fetchObject();
      if (!$relcvterm) {
        $this->logMessage("Cannot find the relationship term type: '!type'",
          ['!type' => $rel_type], TRIPAL_ERROR);
        return 0;
      }
    }

    // We need to get the table schema to make sure we don't overrun the
    // size of fields with what our regular expressions retrieve.
    $feature_tbl = chado_get_schema('feature');
    $dbxref_tbl = chado_get_schema('dbxref');
    $name_max = $feature_tbl['fields']['name']['length'];
    $accession_max = $dbxref_tbl['fields']['accession']['length'];

    // Default pattern: the first word of the definition line, ended by
    // whitespace or a bar.
    $first_word_re = '/^\s*(.*?)[\s\|].*$/';

    $this->logMessage("Step 1: Finding sequences...");
    $fh = fopen($file_path, 'r');
    if (!$fh) {
      throw new Exception(t("Cannot open file: !dfile", ['!dfile' => $file_path]));
    }

    // Iterate through the lines of the file. Keep a record for
    // where in the file each line is at for later import.
    $seqs = [];
    $num_seqs = 0;
    $num_read = 0;
    $prev_pos = 0;
    $i = 0;
    // Compare against FALSE explicitly so that a final line holding only "0"
    // (which PHP treats as falsy) does not end the scan early.
    while (($line = fgets($fh)) !== FALSE) {
      $i++;
      $num_read += strlen($line);

      // If we encounter a definition line then get the name, uniquename,
      // accession and relationship subject from the definition line.
      if (substr($line, 0, 1) === '>') {
        // Remove the > symbol from the defline.
        $defline = substr($line, 1);

        // Get the feature name if a regular expression is provided.
        $name = "";
        if ($re_name) {
          $matches = [];
          if (!preg_match("/$re_name/", $defline, $matches) || !isset($matches[1])) {
            $this->logMessage("Regular expression for the feature name finds nothing. Line !line.",
              ['!line' => $i], TRIPAL_ERROR);
          }
          elseif (strlen($matches[1]) > $name_max) {
            $this->logMessage("Regular expression retrieves a value too long for the feature name. Line !line.",
              ['!line' => $i], TRIPAL_WARNING);
          }
          else {
            $name = trim($matches[1]);
          }
        }
        // If the match_type is name and no regular expression was provided
        // then use the first word as the name, otherwise we don't set the name.
        elseif (strcmp($match_type, 'Name') == 0) {
          $matches = [];
          if (preg_match($first_word_re, $defline, $matches)) {
            if (strlen($matches[1]) > $name_max) {
              $this->logMessage("Regular expression retrieves a feature name too long for the feature name. Line !line.",
                ['!line' => $i], TRIPAL_WARNING);
            }
            else {
              $name = trim($matches[1]);
            }
          }
          else {
            $this->logMessage("Cannot find a feature name. Line !line.", ['!line' => $i], TRIPAL_WARNING);
          }
        }

        // Get the feature uniquename if a regular expression is provided.
        $uname = "";
        if ($re_uname) {
          $matches = [];
          // Only read the capture group when the expression matched and
          // actually has one; otherwise the unique name stays empty.
          if (preg_match("/$re_uname/", $defline, $matches) && isset($matches[1])) {
            $uname = trim($matches[1]);
          }
          else {
            $this->logMessage("Regular expression for the feature unique name finds nothing. Line !line.",
              ['!line' => $i], TRIPAL_ERROR);
          }
        }
        // If the match_type is unique name and no regular expression was
        // provided then use the first word as the unique name, otherwise we
        // don't set the uniquename.
        elseif (strcmp($match_type, 'Unique name') == 0) {
          $matches = [];
          if (preg_match($first_word_re, $defline, $matches)) {
            $uname = trim($matches[1]);
          }
          else {
            $this->logMessage("Cannot find a feature unique name. Line !line.",
              ['!line' => $i], TRIPAL_ERROR);
          }
        }

        // Get the accession if a regular expression is provided. A definition
        // line without an accession simply leaves it empty.
        $accession = "";
        if (!empty($re_accession)) {
          $matches = [];
          if (preg_match("/$re_accession/", $defline, $matches) && isset($matches[1])) {
            if (strlen($matches[1]) > $accession_max) {
              $this->logMessage("Regular expression retrieves an accession that is too long. Cannot add cross reference. Line !line.",
                ['!line' => $i], TRIPAL_WARNING);
            }
            else {
              $accession = trim($matches[1]);
            }
          }
        }

        // Get the relationship subject. It defaults to the unique name. When
        // an expression is given its result replaces the default, and the
        // subject is left empty if the expression finds nothing.
        $subject = $uname ? $uname : "";
        if (!empty($re_subject)) {
          $matches = [];
          // NOTE(review): this expression is applied to the full line, with
          // the leading ">", unlike the expressions above. Kept as is so that
          // existing expressions anchored on ">" continue to work.
          if (preg_match("/$re_subject/", $line, $matches) && isset($matches[1])) {
            $subject = trim($matches[1]);
          }
          else {
            $subject = "";
          }
        }

        // If this isn't the first sequence, then the previous sequence ended
        // where this definition line began.
        if ($num_seqs > 0) {
          $seqs[$num_seqs - 1]['seq_end'] = $prev_pos;
        }

        // Add the details to the sequence. The residues start right after
        // the definition line, which is the current file position.
        $seqs[$num_seqs] = [
          'name' => $name,
          'uname' => $uname,
          'accession' => $accession,
          'subject' => $subject,
          'seq_start' => ftell($fh),
        ];
        $num_seqs++;
      }

      // Keep the current file position so we can use it to set the sequence
      // ending position.
      $prev_pos = ftell($fh);
    }

    // The last sequence runs to the end of what was read. Skip this when the
    // file contained no definition lines so no bogus entry is created.
    if ($num_seqs > 0) {
      $seqs[$num_seqs - 1]['seq_end'] = $num_read;
    }

    // Now that we know where the sequences are in the file we need to add them.
    $this->logMessage("Step 2: Importing sequences...");
    $this->logMessage("Found !num_seqs sequence(s).", ['!num_seqs' => $num_seqs]);
    $this->setTotalItems($num_seqs);
    $this->setItemsHandled(0);
    $source = NULL;
    try {
      for ($j = 0; $j < $num_seqs; $j++) {
        $seq = $seqs[$j];
        $this->loadFastaFeature($fh, $seq['name'], $seq['uname'], $db_id,
          $seq['accession'], $seq['subject'], $rel_type, $parent_type,
          $analysis_id, $organism_id, $cvterm, $source, $method, $re_name,
          $match_type, $parentcvterm, $relcvterm, $seq['seq_start'],
          $seq['seq_end']);
        // $j is zero based, so $j + 1 sequences have been handled so far.
        $this->setItemsHandled($j + 1);
      }
    }
    catch (Exception $e) {
      // Do not leak the file handle when a feature fails to import.
      fclose($fh);
      throw $e;
    }
    fclose($fh);
    $this->setItemsHandled($num_seqs);
  }
  590. /**
  591. * A helper function for loadFasta() to load a single feature
  592. *
  593. * @ingroup fasta_loader
  594. */
  595. private function loadFastaFeature($fh, $name, $uname, $db_id, $accession, $parent,
  596. $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm, $source, $method, $re_name,
  597. $match_type, $parentcvterm, $relcvterm, $seq_start, $seq_end) {
  598. // Check to see if this feature already exists if the match_type is 'Name'.
  599. if (strcmp($match_type, 'Name') == 0) {
  600. $values = [
  601. 'organism_id' => $organism_id,
  602. 'name' => $name,
  603. 'type_id' => $cvterm->cvterm_id,
  604. ];
  605. $results = chado_select_record('feature', [
  606. 'feature_id',
  607. ], $values);
  608. if (count($results) > 1) {
  609. $this->logMessage("Multiple features exist with the name '!name' of type '!type' for the organism. skipping",
  610. ['!name' => $name, '!type' => $cvterm->name], TRIPAL_ERROR);
  611. return 0;
  612. }
  613. if (count($results) == 1) {
  614. $feature = $results[0];
  615. }
  616. }
  617. // Check if this feature already exists if the match_type is 'Unique Name'.
  618. if (strcmp($match_type, 'Unique name') == 0) {
  619. $values = [
  620. 'organism_id' => $organism_id,
  621. 'uniquename' => $uname,
  622. 'type_id' => $cvterm->cvterm_id,
  623. ];
  624. $results = chado_select_record('feature', ['feature_id'], $values);
  625. if (count($results) > 1) {
  626. $this->logMessage("Multiple features exist with the name '!name' of type '!type' for the organism. skipping",
  627. ['!name' => $name, '!type' => $cvterm->name], TRIPAL_WARNING);
  628. return 0;
  629. }
  630. if (count($results) == 1) {
  631. $feature = $results[0];
  632. }
  633. // If the feature exists but this is an "insert only" then skip.
  634. if (isset($feature) and (strcmp($method, 'Insert only') == 0)) {
  635. $this->logMessage("Feature already exists '!name' ('!uname') while matching on !type. Skipping insert.",
  636. [
  637. '!name' => $name,
  638. '!uname' => $uname,
  639. '!type' => drupal_strtolower($match_type),
  640. ], TRIPAL_WARNING);
  641. return 0;
  642. }
  643. }
  644. // If we don't have a feature and we're doing an insert then do the insert.
  645. $inserted = 0;
  646. if (!isset($feature) and (strcmp($method, 'Insert only') == 0 or strcmp($method, 'Insert and update') == 0)) {
  647. // If we have a unique name but not a name then set them to be the same
  648. if (!$uname) {
  649. $uname = $name;
  650. }
  651. elseif (!$name) {
  652. $name = $uname;
  653. }
  654. // Insert the feature record.
  655. $values = [
  656. 'organism_id' => $organism_id,
  657. 'name' => $name,
  658. 'uniquename' => $uname,
  659. 'type_id' => $cvterm->cvterm_id,
  660. ];
  661. $success = chado_insert_record('feature', $values);
  662. if (!$success) {
  663. $this->logMessage("Failed to insert feature '!name (!uname)'", [
  664. '!name' => $name,
  665. '!uname' => $uname,
  666. ], TRIPAL_ERROR);
  667. return 0;
  668. }
  669. // now get the feature we just inserted
  670. $values = [
  671. 'organism_id' => $organism_id,
  672. 'uniquename' => $uname,
  673. 'type_id' => $cvterm->cvterm_id,
  674. ];
  675. $results = chado_select_record('feature', ['feature_id'], $values);
  676. if (count($results) == 1) {
  677. $inserted = 1;
  678. $feature = $results[0];
  679. }
  680. else {
  681. $this->logMessage("Failed to retreive newly inserted feature '!name (!uname)'", [
  682. '!name' => $name,
  683. '!uname' => $uname,
  684. ], TRIPAL_ERRORR);
  685. return 0;
  686. }
  687. // Add the residues for this feature
  688. $this->loadFastaResidues($fh, $feature->feature_id, $seq_start, $seq_end);
  689. }
  690. // if we don't have a feature and the user wants to do an update then fail
  691. if (!isset($feature) and (strcmp($method, 'Update only') == 0 or strcmp($method, 'Insert and update') == 0)) {
  692. $this->logMessage("Failed to find feature '!name' ('!uname') while matching on " . drupal_strtolower($match_type) . ".",
  693. ['!name' => $name, '!uname' => $uname], TRIPAL_ERROR);
  694. return 0;
  695. }
  696. // if we do have a feature and this is an update then proceed with the update
  697. if (isset($feature) and !$inserted and (strcmp($method, 'Update only') == 0 or strcmp($method, 'Insert and update') == 0)) {
  698. // if the user wants to match on the Name field
  699. if (strcmp($match_type, 'Name') == 0) {
  700. // if we're matching on the name but do not have a unique name then we
  701. // don't want to update the uniquename.
  702. $values = [];
  703. if ($uname) {
  704. // First check to make sure that by changing the unique name of this
  705. // feature that we won't conflict with another existing feature of
  706. // the same name
  707. $values = [
  708. 'organism_id' => $organism_id,
  709. 'uniquename' => $uname,
  710. 'type_id' => $cvterm->cvterm_id,
  711. ];
  712. $results = chado_select_record('feature', ['feature_id'], $values);
  713. if (count($results) > 0) {
  714. $this->logMessage("Cannot update the feature '!name' with a uniquename of '!uname' and type of '!type' as it " .
  715. "conflicts with an existing feature with the same uniquename and type.",
  716. [
  717. '!name' => $name,
  718. '!uname' => $uname,
  719. '!type' => $cvterm->name,
  720. ], TRIPAL_ERROR);
  721. return 0;
  722. }
  723. // the changes to the uniquename don't conflict so proceed with the update
  724. $values = ['uniquename' => $uname];
  725. $match = [
  726. 'name' => $name,
  727. 'organism_id' => $organism_id,
  728. 'type_id' => $cvterm->cvterm_id,
  729. ];
  730. // perform the update
  731. $success = chado_update_record('feature', $match, $values);
  732. if (!$success) {
  733. $this->logMessage("Failed to update feature '!name' ('!name')",
  734. ['!name' => $name, '!uiname' => $uname], TRIPAL_ERROR);
  735. return 0;
  736. }
  737. }
  738. }
  739. // If the user wants to match on the unique name field.
  740. if (strcmp($match_type, 'Unique name') == 0) {
  741. // If we're matching on the uniquename and have a new name then
  742. // we want to update the name.
  743. $values = [];
  744. if ($name) {
  745. $values = ['name' => $name];
  746. $match = [
  747. 'uniquename' => $uname,
  748. 'organism_id' => $organism_id,
  749. 'type_id' => $cvterm->cvterm_id,
  750. ];
  751. $success = chado_update_record('feature', $match, $values);
  752. if (!$success) {
  753. $this->logMessage("Failed to update feature '!name' ('!name')",
  754. ['!name' => $name, '!uiname' => $uname], TRIPAL_ERROR);
  755. return 0;
  756. }
  757. }
  758. }
  759. }
  760. // Update the residues for this feature
  761. $this->loadFastaResidues($fh, $feature->feature_id, $seq_start, $seq_end);
  762. // add in the analysis link
  763. if ($analysis_id) {
  764. // if the association doens't already exist then add one
  765. $values = [
  766. 'analysis_id' => $analysis_id,
  767. 'feature_id' => $feature->feature_id,
  768. ];
  769. $results = chado_select_record('analysisfeature', ['analysisfeature_id'], $values);
  770. if (count($results) == 0) {
  771. $success = chado_insert_record('analysisfeature', $values);
  772. if (!$success) {
  773. $this->logMessage("Failed to associate analysis and feature '!name' ('!name')",
  774. ['!name' => $name, '!uname' => $uname], TRIPAL_ERROR);
  775. return 0;
  776. }
  777. }
  778. }
  779. // now add the database cross reference
  780. if ($db_id) {
  781. // check to see if this accession reference exists, if not add it
  782. $values = [
  783. 'db_id' => $db_id,
  784. 'accession' => $accession,
  785. ];
  786. $results = chado_select_record('dbxref', ['dbxref_id'], $values);
  787. // if the accession doesn't exist then add it
  788. if (count($results) == 0) {
  789. $results = chado_insert_record('dbxref', $values);
  790. if (!$results) {
  791. $this->logMessage("Failed to add database accession '!accession'",
  792. ['!accession' => $accession], TRIPAL_ERROR);
  793. return 0;
  794. }
  795. $results = chado_select_record('dbxref', ['dbxref_id'], $values);
  796. if (count($results) == 1) {
  797. $dbxref = $results[0];
  798. }
  799. else {
  800. $this->logMessage("Failed to retreive newly inserted dbxref '!name (!uname)'",
  801. ['!name' => $name, '!uname' => $uname], TRIPAL_ERROR);
  802. return 0;
  803. }
  804. }
  805. else {
  806. $dbxref = $results[0];
  807. }
  808. // check to see if the feature dbxref record exists if not, then add it
  809. $values = [
  810. 'feature_id' => $feature->feature_id,
  811. 'dbxref_id' => $dbxref->dbxref_id,
  812. ];
  813. $results = chado_select_record('feature_dbxref', ['feature_dbxref_id'], $values);
  814. if (count($results) == 0) {
  815. $success = chado_insert_record('feature_dbxref', $values);
  816. if (!$success) {
  817. $this->logMessage("Failed to add associate database accession '!accession' with feature",
  818. ['!accession' => $accession], TRIPAL_ERROR);
  819. return 0;
  820. }
  821. }
  822. }
  823. // now add in the relationship if one exists. If not, then add it
  824. if ($rel_type) {
  825. $values = [
  826. 'organism_id' => $organism_id,
  827. 'uniquename' => $parent,
  828. 'type_id' => $parentcvterm->cvterm_id,
  829. ];
  830. $results = chado_select_record('feature', ['feature_id'], $values);
  831. if (count($results) != 1) {
  832. $this->logMessage("Cannot find a unique feature for the parent '!parent' of type '!type' for the feature.",
  833. ['!parent' => $parent, '!type' => $parent_type], TRIPAL_ERROR);
  834. return 0;
  835. }
  836. $parent_feature = $results[0];
  837. // check to see if the relationship already exists if not then add it
  838. $values = [
  839. 'subject_id' => $feature->feature_id,
  840. 'object_id' => $parent_feature->feature_id,
  841. 'type_id' => $relcvterm->cvterm_id,
  842. ];
  843. $results = chado_select_record('feature_relationship', ['feature_relationship_id'], $values);
  844. if (count($results) == 0) {
  845. $success = chado_insert_record('feature_relationship', $values);
  846. if (!$success) {
  847. $this->logMessage("Failed to add associate database accession '!accession' with feature",
  848. ['!accession' => $accession], TRIPAL_ERROR);
  849. return 0;
  850. }
  851. }
  852. }
  853. }
  /**
   * Loads one sequence from the FASTA file into the feature.residues column.
   *
   * Positions the file at the start of the sequence, reads it line by line,
   * strips surrounding whitespace (including the line endings) from every
   * line and appends the result to feature.residues in chunks, so that a very
   * large sequence is never sent to the database in a single query. After all
   * residues are stored, the seqlen and md5checksum columns are recalculated
   * from the stored residues.
   *
   * Any residues previously stored for the feature are replaced.
   *
   * @param $fh
   *   An open, seekable file handle for the FASTA file.
   * @param $feature_id
   *   The feature_id of the feature record whose residues are written.
   * @param $seq_start
   *   The byte offset in the file at which the sequence begins.
   * @param $seq_end
   *   The byte offset in the file at which the sequence ends. Reading stops
   *   as soon as the file position reaches this offset.
   *
   * @return bool
   *   TRUE if the residues were stored, FALSE if the file could not be
   *   positioned or one of the database queries failed.
   */
  private function loadFastaResidues($fh, $feature_id, $seq_start, $seq_end) {
    // Upper bound on the amount of sequence buffered in memory before it is
    // appended to the database.
    $chunk_size = 100000000;

    // Position the file at the beginning of the sequence. fseek() returns -1
    // on failure; reading on from an arbitrary position would store garbage.
    if (fseek($fh, $seq_start, SEEK_SET) !== 0) {
      return FALSE;
    }

    // Make sure residues is an empty string rather than NULL, because
    // appending to NULL with the || operator yields NULL.
    $sql = "UPDATE {feature} SET residues = '' WHERE feature_id = :feature_id";
    if (!chado_query($sql, [':feature_id' => $feature_id])) {
      return FALSE;
    }

    $append_sql = "
      UPDATE {feature}
      SET residues = residues || :chunk
      WHERE feature_id = :feature_id
    ";

    // Read lines until the end of the sequence is reached. The position is
    // checked BEFORE each read so that an empty sequence, or an end offset
    // that does not fall exactly on a line boundary, can never cause the
    // following records in the file to be loaded as residues. The explicit
    // comparison with FALSE keeps a line such as "0" from ending the loop.
    $chunk = '';
    while (ftell($fh) < $seq_end && ($line = fgets($fh)) !== FALSE) {
      $chunk .= trim($line);

      // Once enough sequence is buffered, append it to the database.
      if (strlen($chunk) >= $chunk_size) {
        $success = chado_query($append_sql, [
          ':feature_id' => $feature_id,
          ':chunk' => $chunk,
        ]);
        if (!$success) {
          return FALSE;
        }
        $chunk = '';
      }
    }

    // Write the last bit of sequence if any remains.
    if ($chunk !== '') {
      $success = chado_query($append_sql, [
        ':feature_id' => $feature_id,
        ':chunk' => $chunk,
      ]);
      if (!$success) {
        return FALSE;
      }
    }

    // Now update the seqlen and md5checksum fields from the stored residues.
    $sql = "UPDATE {feature} SET seqlen = char_length(residues), md5checksum = md5(residues) WHERE feature_id = :feature_id";
    if (!chado_query($sql, [':feature_id' => $feature_id])) {
      return FALSE;
    }

    return TRUE;
  }
  933. }