tripal_feature.fasta_loader.inc 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961
  1. <?php
  2. /**
  3. * @file
  4. * Provides fasta loading functionality. Creates features based on their specification
  5. * in a fasta file.
  6. */
  7. /**
  8. * @defgroup fasta_loader FASTA Feature Loader
  9. * @ingroup tripal_feature
  10. * @{
  11. * Provides fasta loading functionality. Creates features based on their specification
  12. * in a fasta file.
  13. * @}
  14. *
  15. */
  /**
   * Builds the form used to submit a FASTA loading job.
   *
   * The form collects the FASTA file path, the organism and Sequence Ontology
   * type of the sequences, the insert/update method, how existing features are
   * matched, the analysis the features derive from, and optional advanced
   * settings (regular expressions for names, an external database
   * cross-reference and a relationship to already loaded features).
   *
   * Organism, analysis and database choices are read from Chado each time the
   * form is built.
   *
   * @return
   *   A Drupal Form API array.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form() {
    $form = array();

    $form['fasta_file'] = array(
      '#type' => 'textfield',
      '#title' => t('FASTA File'),
      '#description' => t('Please enter the full system path for the FASTA file, or a path within the Drupal
        installation (e.g. /sites/default/files/xyz.fasta). The path must be accessible to the
        server on which this Drupal instance is running.'),
      '#required' => TRUE,
    );

    // Get the list of organisms.
    $sql = "SELECT * FROM {organism} ORDER BY genus, species";
    $org_rset = chado_query($sql);
    $organisms = array();
    $organisms[''] = '';
    while ($organism = $org_rset->fetchObject()) {
      $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
    }
    $form['organism_id'] = array(
      '#title' => t('Organism'),
      // Element types are machine names and must never be translated.
      '#type' => 'select',
      '#description' => t("Choose the organism to which these sequences are associated"),
      '#required' => TRUE,
      '#options' => $organisms,
    );

    // Get the Sequence Ontology CV ID; it scopes the term autocomplete.
    $values = array('name' => 'sequence');
    $cv = chado_select_record('cv', array('cv_id'), $values);
    $form['seqtype'] = array(
      '#type' => 'textfield',
      '#title' => t('Sequence Type'),
      '#required' => TRUE,
      '#description' => t('Please enter the Sequence Ontology (SO) term name that describes the sequences in the FASTA file (e.g. gene, mRNA, protein, etc...)'),
    );
    // Only offer autocomplete when the 'sequence' vocabulary is present;
    // otherwise the field still works as a plain text field.
    if (is_array($cv) and count($cv) > 0) {
      $cv_id = $cv[0]->cv_id;
      $form['seqtype']['#autocomplete_path'] = "admin/tripal/chado/tripal_cv/cvterm/auto_name/$cv_id";
    }

    // The option keys (0, 1, 2) are what the validate and submit handlers map
    // back to 'Insert only', 'Update only' and 'Insert and update'.
    $form['method'] = array(
      '#type' => 'radios',
      '#title' => t('Method'),
      '#required' => TRUE,
      '#options' => array(
        t('Insert only'),
        t('Update only'),
        t('Insert and update'),
      ),
      '#description' => t('Select how features in the FASTA file are handled.
        Select "Insert only" to insert the new features. If a feature already
        exists with the same name or unique name and type then it is skipped.
        Select "Update only" to only update features that already exist in the
        database. Select "Insert and Update" to insert features that do
        not exist and update those that do.'),
      '#default_value' => 2,
    );

    // The option keys (0, 1) map to 'Name' and 'Unique name' in the handlers.
    $form['match_type'] = array(
      '#type' => 'radios',
      '#title' => t('Name Match Type'),
      '#required' => TRUE,
      '#options' => array(
        t('Name'),
        t('Unique name'),
      ),
      '#description' => t('Used for "updates only" or "insert and update" methods. Not required if method type is "insert".
        Feature data is stored in Chado with both a human-readable
        name and a unique name. If the features in your FASTA file are uniquely identified using
        a human-readable name then select the "Name" button. If your features are
        uniquely identified using the unique name then select the "Unique name" button. If you
        loaded your features first using the GFF loader then the unique name of each
        features were indicated by the "ID=" attribute and the name by the "Name=" attribute.
        By default, the FASTA loader will use the first word (character string
        before the first space) as the name for your feature. If
        this does not uniquely identify your feature consider specifying a regular expression in the advanced section below.
        Additionally, you may import both a name and a unique name for each sequence using the advanced options.'),
      '#default_value' => 1,
    );

    $form['analysis'] = array(
      '#type' => 'fieldset',
      '#title' => t('Analysis Used to Derive Features'),
      '#collapsed' => TRUE
    );
    $form['analysis']['desc'] = array(
      '#markup' => t("Why specify an analysis for a data load? All data comes
        from some place, even if downloaded from Genbank. By specifying
        analysis details for all data uploads, it allows an end user to reproduce the
        data set, but at least indicates the source of the data."),
    );

    // Get the list of analyses.
    $sql = "SELECT * FROM {analysis} ORDER BY name";
    $analysis_rset = chado_query($sql);
    $analyses = array();
    $analyses[''] = '';
    while ($analysis = $analysis_rset->fetchObject()) {
      $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
    }
    $form['analysis']['analysis_id'] = array(
      '#title' => t('Analysis'),
      '#type' => 'select',
      '#description' => t("Choose the analysis to which these features are associated"),
      '#required' => TRUE,
      '#options' => $analyses,
    );

    // Advanced options.
    $form['advanced'] = array(
      '#type' => 'fieldset',
      '#title' => t('Advanced Options'),
      '#collapsible' => TRUE,
      '#collapsed' => TRUE
    );
    $form['advanced']['re_help'] = array(
      '#type' => 'item',
      // An 'item' element renders '#markup'; '#value' is not displayed.
      '#markup' => t('A regular expression is an advanced method for extracting information from a string of text.
        Your FASTA file may contain both a human-readable name and a unique name for each sequence.
        If you want to import
        both the name and unique name for all sequences, then you must provide regular expressions
        so that the loader knows how to separate them.
        Otherwise the name and uniquename will be the same.
        By default, this loader will use the first word in the definition
        lines of the FASTA file
        as the name or unique name of the feature.'),
    );
    $form['advanced']['re_name'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature name from the FASTA definition line. For example, for a
        definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the name would be, "^(.*?)\|.*$".'),
    );
    $form['advanced']['re_uname'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the unique name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature unique name from the FASTA definition line. For example, for a
        definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the unique name would be "^.*?\|(.*)$".'),
    );

    // Advanced database cross-reference options.
    $form['advanced']['db'] = array(
      '#type' => 'fieldset',
      '#title' => t('External Database Reference'),
      '#weight' => 6,
      '#collapsed' => TRUE
    );
    $form['advanced']['db']['re_accession'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the accession'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
      '#weight' => 2
    );

    // Get the list of databases.
    $sql = "SELECT * FROM {db} ORDER BY name";
    $db_rset = chado_query($sql);
    $dbs = array();
    $dbs[''] = '';
    while ($db = $db_rset->fetchObject()) {
      $dbs[$db->db_id] = "$db->name";
    }
    $form['advanced']['db']['db_id'] = array(
      '#title' => t('External Database'),
      '#type' => 'select',
      '#description' => t("Please choose an external database for which these sequences have a cross reference."),
      '#required' => FALSE,
      '#options' => $dbs,
      '#weight' => 1,
    );

    // Advanced relationship options.
    $form['advanced']['relationship'] = array(
      '#type' => 'fieldset',
      '#title' => t('Relationships'),
      '#weight' => 6,
      '#collapsed' => TRUE
    );
    // Keys are the relationship term names looked up by the loader; the
    // values are the labels shown to the user.
    $rels = array();
    $rels[''] = '';
    $rels['part_of'] = 'part of';
    $rels['derives_from'] = 'produced by';
    $form['advanced']['relationship']['rel_type'] = array(
      '#title' => t('Relationship Type'),
      '#type' => 'select',
      '#description' => t("Use this option to create associations, or relationships between the
        features of this FASTA file and existing features in the database. For
        example, to associate a FASTA file of peptides to existing genes or transcript sequence,
        select the type 'produced by'. For a CDS sequences select the type 'part of'"),
      '#required' => FALSE,
      '#options' => $rels,
      '#weight' => 5,
    );
    $form['advanced']['relationship']['re_subject'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the parent'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique
        name needed to identify the existing sequence for which the
        relationship type selected above will apply.'),
      '#weight' => 6
    );
    $form['advanced']['relationship']['parent_type'] = array(
      '#type' => 'textfield',
      '#title' => t('Parent Type'),
      '#required' => FALSE,
      '#description' => t('Please enter the Sequence Ontology term for the parent. For example
        if the FASTA file being loaded is a set of proteins that are
        products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
        this type must match the type for already loaded features.'),
      '#weight' => 7
    );

    $form['button'] = array(
      '#type' => 'submit',
      '#value' => t('Import FASTA file'),
      '#weight' => 10,
    );
    return $form;
  }
  /**
   * Validates the FASTA loader job form.
   *
   * Checks that:
   * - the name/unique name regular expressions are consistent with the
   *   selected match type,
   * - every supplied regular expression compiles when wrapped in "/"
   *   delimiters (which is how the loader applies them),
   * - the FASTA file can be found (relative to the Drupal root or as an
   *   absolute path) and is readable,
   * - relationship and database cross-reference settings are complete,
   * - the sequence type and parent type exist in the 'sequence' vocabulary
   *   (by name or synonym).
   *
   * On completion the resolved file path is stored in
   * $form_state['storage']['dfile'] for the submit handler.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state, passed by reference.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_validate($form, &$form_state) {
    $fasta_file = trim($form_state['values']['fasta_file']);
    $type = trim($form_state['values']['seqtype']);
    $match_type = trim($form_state['values']['match_type']);
    $re_name = trim($form_state['values']['re_name']);
    $re_uname = trim($form_state['values']['re_uname']);
    $re_accession = trim($form_state['values']['re_accession']);
    $db_id = $form_state['values']['db_id'];
    $rel_type = $form_state['values']['rel_type'];
    $re_subject = trim($form_state['values']['re_subject']);
    $parent_type = trim($form_state['values']['parent_type']);

    // The radios submit the option key; translate it to the label used by
    // the loader.
    $match_types = array('Name', 'Unique name');
    if (isset($match_types[$match_type])) {
      $match_type = $match_types[$match_type];
    }

    if ($re_name and !$re_uname and strcmp($match_type, 'Unique name') == 0) {
      form_set_error('re_uname', t("You must provide a regular expression to identify the sequence unique name"));
    }
    if (!$re_name and $re_uname and strcmp($match_type, 'Name') == 0) {
      form_set_error('re_name', t("You must provide a regular expression to identify the sequence name"));
    }

    // Reject expressions that do not compile. The loader interpolates each
    // expression between "/" delimiters, so it is tested the same way here.
    // The error suppression is deliberate: preg_match() raises a warning for
    // a bad pattern and we report the problem on the form instead.
    $expressions = array(
      're_name' => $re_name,
      're_uname' => $re_uname,
      're_accession' => $re_accession,
      're_subject' => $re_subject,
    );
    foreach ($expressions as $element => $expression) {
      if ($expression and @preg_match("/$expression/", '') === FALSE) {
        form_set_error($element, t('The regular expression "%re" is not valid. Note that any "/" characters must be escaped.', array('%re' => $expression)));
      }
    }

    // Check to see if the file is located local to Drupal.
    $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
    if (!is_file($dfile)) {
      // If not local to Drupal, the file must be someplace else, just use
      // the full path provided.
      $dfile = $fasta_file;
    }
    // is_file() rather than file_exists() so that a directory is not accepted,
    // and is_readable() because the loader must be able to open the file.
    if (!is_file($dfile) or !is_readable($dfile)) {
      form_set_error('fasta_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
    }

    // Make sure if a relationship is specified that all fields are provided.
    if (($rel_type or $parent_type) and !$re_subject) {
      form_set_error('re_subject', t("Please provide a regular expression for the parent"));
    }
    if (($rel_type or $re_subject) and !$parent_type) {
      form_set_error('parent_type', t("Please provide a SO term for the parent"));
    }
    if (($parent_type or $re_subject) and !$rel_type) {
      form_set_error('rel_type', t("Please select a relationship type"));
    }

    // Make sure if a database is specified that all fields are provided.
    if ($db_id and !$re_accession) {
      form_set_error('re_accession', t("Please provide a regular expression for the accession"));
    }
    if ($re_accession and !$db_id) {
      form_set_error('db_id', t("Please select a database"));
    }

    // Check to make sure the types exist, matching on term name or synonym.
    $cvtermsql = "SELECT CVT.cvterm_id
      FROM {cvterm} CVT
      INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
      LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
      WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)";
    $cvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $type, ':synonym' => $type))->fetchObject();
    if (!$cvterm) {
      // The form element is named 'seqtype'; flag that element so the error
      // is attached to the right field.
      form_set_error('seqtype', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
    }
    // Verify the parent term whenever one was entered, even if the
    // relationship type was left empty.
    if ($parent_type) {
      $cvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
      if (!$cvterm) {
        form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
      }
    }

    // Remember the resolved path so the submit handler does not repeat the
    // lookup.
    $form_state['storage']['dfile'] = $dfile;
  }
  /**
   * Submit handler for the FASTA loader form: queues a Tripal job.
   *
   * Reads the validated form values and the file path resolved during
   * validation, converts the numeric 'method' and 'match_type' radio values
   * to their label strings, and registers a 'tripal_feature_load_fasta' job
   * for the current user.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state, passed by reference.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_submit($form, &$form_state) {
    global $user;

    $submitted = $form_state['values'];

    // Path resolved by the validate handler.
    $dfile = $form_state['storage']['dfile'];

    // Values used as-is.
    $organism_id = $submitted['organism_id'];
    $db_id = $submitted['db_id'];
    $rel_type = $submitted['rel_type'];
    $analysis_id = $submitted['analysis_id'];

    // Free-text values have surrounding whitespace removed.
    $type = trim($submitted['seqtype']);
    $re_name = trim($submitted['re_name']);
    $re_uname = trim($submitted['re_uname']);
    $re_accession = trim($submitted['re_accession']);
    $re_subject = trim($submitted['re_subject']);
    $parent_type = trim($submitted['parent_type']);

    // Convert the radio option keys into the labels the loader expects.
    $method = trim($submitted['method']);
    switch ($method) {
      case 0:
        $method = 'Insert only';
        break;

      case 1:
        $method = 'Update only';
        break;

      case 2:
        $method = 'Insert and update';
        break;
    }

    $match_type = trim($submitted['match_type']);
    switch ($match_type) {
      case 0:
        $match_type = 'Name';
        break;

      case 1:
        $match_type = 'Unique name';
        break;
    }

    // Argument order must match the signature of tripal_feature_load_fasta().
    $job_args = array(
      $dfile,
      $organism_id,
      $type,
      $re_name,
      $re_uname,
      $re_accession,
      $db_id,
      $rel_type,
      $re_subject,
      $parent_type,
      $method,
      $user->uid,
      $analysis_id,
      $match_type,
    );

    // Use only the file name (text after the last slash) in the job title.
    $fname = preg_replace("/.*\/(.*)/", "$1", $dfile);
    $job_name = "Import FASTA file: $fname";
    tripal_add_job($job_name, 'tripal_feature', 'tripal_feature_load_fasta', $job_args, $user->uid);
  }
  /**
   * Actually load a fasta file. This is the function called by tripal jobs.
   *
   * The file is read line by line. Each definition line (starting with '>')
   * begins a new record: the name, unique name, accession and relationship
   * subject are extracted from it, and the sequence lines that follow are
   * concatenated as the residues. Each completed record is passed to
   * tripal_feature_fasta_loader_handle_feature(). The whole load runs inside
   * a database transaction that is rolled back if an exception is thrown.
   *
   * Every regular expression argument is wrapped in "/" delimiters before use
   * and must contain a capture group; the first captured group is the value.
   *
   * @param $dfile
   *   The full path to the fasta file to load
   * @param $organism_id
   *   The organism_id of the organism these features are from
   * @param $type
   *   The type of features contained in the fasta file
   * @param $re_name
   *   A regular expression to extract the feature.name from the fasta header
   * @param $re_uname
   *   A regular expression to extract the feature.uniquename from the fasta header
   * @param $re_accession
   *   A regular expression to extract the accession of the feature.dbxref_id
   * @param $db_id
   *   The db_id of the above dbxref
   * @param $rel_type
   *   The type of relationship when creating a feature_relationship between this
   *   feature (object) and an extracted subject
   * @param $re_subject
   *   The regular expression to extract the uniquename of the feature to be the subject
   *   of the above specified relationship
   * @param $parent_type
   *   The type of the parent feature
   * @param $method
   *   The method of feature adding. (ie: 'Insert only', 'Update only', 'Insert and update')
   * @param $uid
   *   The user id of the user who submitted the job
   * @param $analysis_id
   *   The analysis_id to associate the features in this fasta file with
   * @param $match_type
   *   Whether to match existing features based on the 'Name' or 'Unique name'
   * @param $job = NULL
   *   The tripal job
   *
   * @return
   *   0 when a required term cannot be found or the file cannot be opened;
   *   otherwise no value is returned.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_load_fasta($dfile, $organism_id, $type,
    $re_name, $re_uname, $re_accession, $db_id, $rel_type,
    $re_subject, $parent_type, $method, $uid, $analysis_id,
    $match_type, $job = NULL) {

    $transaction = db_transaction();
    print "\nNOTE: Loading of this FASTA file is performed using a database transaction. \n" .
      "If the load fails or is terminated prematurely then the entire set of \n" .
      "insertions/updates is rolled back and will not be found in the database\n\n";

    $fh = FALSE;
    try {
      // First get the type for this sequence.
      $cvtermsql = "SELECT CVT.cvterm_id
        FROM {cvterm} CVT
        INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
        LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
        WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)";
      $cvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $type, ':synonym' => $type))->fetchObject();
      if (!$cvterm) {
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the term type: '%type'", array('%type' => $type));
        return 0;
      }

      // These stay NULL when no parent/relationship was requested so that the
      // feature handler always receives defined values.
      $parentcvterm = NULL;
      if ($parent_type) {
        $parentcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
        if (!$parentcvterm) {
          // Report the term that was searched for, not the failed result.
          tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the parent term type: '%type'", array('%type' => $parent_type));
          return 0;
        }
      }
      $relcvterm = NULL;
      if ($rel_type) {
        $relcvterm = chado_query($cvtermsql, array(':cvname' => 'relationship', ':name' => $rel_type, ':synonym' => $rel_type))->fetchObject();
        if (!$relcvterm) {
          tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the relationship term type: '%type'", array('%type' => $rel_type));
          return 0;
        }
      }

      print "Opening FASTA file $dfile\n";
      $fh = fopen($dfile, 'r');
      if (!$fh) {
        tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "cannot open file: %dfile", array('%dfile' => $dfile));
        return 0;
      }
      $filesize = filesize($dfile);

      // State of the record currently being read.
      $i = 0;
      $name = '';
      $uname = '';
      $residues = '';
      $accession = '';
      $subject = '';
      // No source is extracted by this loader; the handler still takes the
      // argument, so pass an explicit NULL.
      $source = NULL;

      // Progress is reported roughly every 1% of the file, measured in bytes.
      $interval = intval($filesize * 0.01);
      if ($interval < 1) {
        $interval = 1;
      }
      $num_read = 0;
      $intv_read = 0;

      // We need to get the table schema to make sure we don't overrun the
      // size of fields with what our regular expressions retrieve.
      $feature_tbl = chado_get_schema('feature');
      $dbxref_tbl = chado_get_schema('dbxref');

      // Compare against FALSE so that a final line such as "0" without a
      // trailing newline does not end the loop early.
      while (($line = fgets($fh)) !== FALSE) {
        $i++; // update the line count
        // filesize() is in bytes, so count bytes rather than characters.
        $line_bytes = strlen($line);
        $num_read += $line_bytes;
        $intv_read += $line_bytes;

        // If we encounter a definition line then get the name, uniquename,
        // accession and relationship subject from the definition line.
        if (preg_match('/^>/', $line)) {

          // If we have a feature name then we are starting a new sequence
          // so lets handle the previous one before moving on.
          if ($name or $uname) {
            tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
              $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
              $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
          }
          // Always start the new record from a clean slate. This also
          // discards residues of a preceding record whose name could not be
          // extracted, so they are not attached to this record.
          $residues = '';
          $name = '';
          $uname = '';
          $accession = '';
          $subject = '';

          $line = preg_replace("/^>/", '', $line); // remove the > symbol from the defline

          // Get the feature name.
          if ($re_name) {
            if (!preg_match("/$re_name/", $line, $matches) or !isset($matches[1])) {
              tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature name finds nothing. Line %line.", array('%line' => $i));
            }
            elseif (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", array('%line' => $i));
            }
            else {
              $name = trim($matches[1]);
            }
          }
          else {
            // If the match_type is name and no regular expression was provided
            // then use the first word as the name, otherwise we don't set the name.
            if (strcmp($match_type, 'Name') == 0) {
              if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
                if (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
                  tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", array('%line' => $i));
                }
                else {
                  $name = trim($matches[1]);
                }
              }
              else {
                tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature name. Line %line.", array('%line' => $i));
              }
            }
          }

          // Get the feature unique name.
          if ($re_uname) {
            if (!preg_match("/$re_uname/", $line, $matches) or !isset($matches[1])) {
              tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array('%line' => $i));
            }
            else {
              // Only read the capture when the match succeeded.
              $uname = trim($matches[1]);
            }
          }
          else {
            // If the match_type is unique name and no regular expression was
            // provided then use the first word as the unique name, otherwise
            // we don't set the uniquename.
            if (strcmp($match_type, 'Unique name') == 0) {
              if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
                $uname = trim($matches[1]);
              }
              else {
                tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature unique name. Line %line.", array('%line' => $i));
              }
            }
          }

          // Get the accession, but only when an expression was supplied; an
          // empty pattern would match everything and capture nothing.
          if ($re_accession and preg_match("/$re_accession/", $line, $matches) and isset($matches[1])) {
            if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves an accession too long for the feature name. Cannot add cross reference. Line %line.", array('%line' => $i));
            }
            else {
              $accession = trim($matches[1]);
            }
          }

          // Get the relationship subject, again only when requested.
          if ($re_subject and preg_match("/$re_subject/", $line, $matches) and isset($matches[1])) {
            $subject = trim($matches[1]);
          }
        }
        else {
          $residues .= trim($line);

          // Update the job status each time another interval has been read.
          if ($job and $intv_read >= $interval) {
            $intv_read = 0;
            $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
            if ($name) {
              print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Current feature: $name\r";
            }
            else {
              print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Current feature: $uname\r";
            }
            tripal_set_job_progress($job, intval(($num_read / $filesize) * 100));
          }
        }
      }

      // Now load the last sequence in the file, if there is one. An empty
      // file, or a final record without a usable name, has nothing to load.
      if ($name or $uname) {
        tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
          $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
          $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
      }
    }
    catch (Exception $e) {
      $transaction->rollback();
      print "\n"; // make sure we start errors on new line
      watchdog_exception('T_fasta_loader', $e);
      print "FAILED: Rolling back database changes...\n";
    }

    // Release the file handle whether or not the load succeeded.
    if (is_resource($fh)) {
      fclose($fh);
    }
    print "\nDone\n";
  }
  564. /**
  565. * A helper function for tripal_feature_load_fasta() to load a single feature
  566. *
  567. * @param $name
  568. * @param $uname
  569. * @param $db_id
  570. * @param $accession
  571. * @param $parent
  572. * @param $rel_type
  573. * @param $parent_type
  574. * @param $analysis_id
  575. * @param $organism_id
  576. * @param $cvterm
  577. * @param $source
  578. * @param $residues
  579. * @param $method
  580. * @param $re_name
  581. * @param $match_type
  582. * @param $parentcvterm
  583. * @param $relcvterm
  584. *
  585. * @ingroup fasta_loader
  586. */
  587. function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $accession,
  588. $parent, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
  589. $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm) {
  590. // check to see if this feature already exists if the match_type is 'Name'
  591. if (strcmp($match_type, 'Name')==0) {
  592. $values = array(
  593. 'organism_id' => $organism_id,
  594. 'name' => $name,
  595. 'type_id' => $cvterm->cvterm_id,
  596. );
  597. $options = array('statement_name' => 'sel_feature_ornaty');
  598. $results = chado_select_record('feature', array('feature_id'), $values, $options);
  599. if (count($results) > 1) {
  600. tripal_report_error('T_fasta_loader', "Multiple features exist with the name '%name' of type
  601. '%type' for the organism. skipping", array('%name' => $name, '%type' => $type));
  602. return 0;
  603. }
  604. if (count($results) == 1) {
  605. $feature = $results[0];
  606. }
  607. }
  608. // check to see if this feature already exists if the match_type is 'Unique Name'
  609. if (strcmp($match_type, 'Unique name')==0) {
  610. $values = array(
  611. 'organism_id' => $organism_id,
  612. 'uniquename' => $uname,
  613. 'type_id' => $cvterm->cvterm_id,
  614. );
  615. $options = array('statement_name' => 'sel_feature_oruqty');
  616. $results = chado_select_record('feature', array('feature_id'), $values, $options);
  617. if (count($results) > 1) {
  618. tripal_report_error('T_fasta_loader', "Multiple features exist with the name '%name' of type
  619. '%type' for the organism. skipping", array('%name' => $name, '%type' => $type));
  620. return 0;
  621. }
  622. if (count($results) == 1) {
  623. $feature = $results[0];
  624. }
  625. // if the feature exists but this is an "insert only" method then skip this feature
  626. if ($feature and (strcmp($method, 'Insert only')==0)) {
  627. tripal_report_error('T_fasta_loader', TRIPAL_WARNING, "Feature already exists '%name' ('%uname') while matching on %type. Skipping insert.",
  628. array('%name' => $name, '%uname' => $uname, '%type' => drupal_strtolower($match_type)));
  629. return 0;
  630. }
  631. }
  632. // if we don't have a feature and we're doing an insert then do the insert
  633. $inserted = 0;
  634. if (!$feature and (strcmp($method, 'Insert only')==0 or strcmp($method, 'Insert and update')==0)) {
  635. // if we have a unique name but not a name then set them to be the same and vice versa
  636. if (!$uname) {
  637. $uname = $name;
  638. }
  639. elseif (!$name) {
  640. $name = $uname;
  641. }
  642. // insert the feature
  643. $values = array(
  644. 'organism_id' => $organism_id,
  645. 'name' => $name,
  646. 'uniquename' => $uname,
  647. 'residues' => $residues,
  648. 'seqlen' => drupal_strlen($residues),
  649. 'md5checksum' => md5($residues),
  650. 'type_id' => $cvterm->cvterm_id,
  651. 'is_analysis' => 'FALSE',
  652. 'is_obsolete' => 'FALSE',
  653. );
  654. $options = array('statement_name' => 'ins_feature_all');
  655. $success = chado_insert_record('feature', $values, $options);
  656. if (!$success) {
  657. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to insert feature '%name (%uname)'",
  658. array('%name' => $name, '%uname' => $numane));
  659. return 0;
  660. }
  661. // now get the feature we just inserted
  662. $values = array(
  663. 'organism_id' => $organism_id,
  664. 'uniquename' => $uname,
  665. 'type_id' => $cvterm->cvterm_id,
  666. );
  667. $options = array('statement_name' => 'sel_feature_oruqty');
  668. $results = chado_select_record('feature', array('feature_id'), $values, $options);
  669. if (count($results) == 1) {
  670. $inserted = 1;
  671. $feature = $results[0];
  672. }
  673. else {
  674. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to retreive newly inserted feature '%name (%uname)'",
  675. array('%name' => $name, '%uname' => $numane));
  676. return 0;
  677. }
  678. }
  679. // if we don't have a feature and the user wants to do an update then fail
  680. if (!$feature and (strcmp($method, 'Update only')==0 or drupal_strcmp($method, 'Insert and update')==0)) {
  681. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to find feature '%name' ('%uname') while matching on " .
  682. drupal_strtolower($match_type), array('%name' => $name, '%uname' => $uname));
  683. return 0;
  684. }
  685. // if we do have a feature and this is an update then proceed with the update
  686. if ($feature and !$inserted and (strcmp($method, 'Update only')==0 or strcmp($method, 'Insert and update')==0)) {
  687. // if the user wants to match on the Name field
  688. if (strcmp($match_type, 'Name')==0) {
  689. // if we're matching on the name but do not have a unique name then we don't want to update the uniquename.
  690. $values = array();
  691. if ($uname) {
  692. // first check to make sure that by changing the unique name of this feature that we won't conflict with
  693. // another existing feature of the same name
  694. $values = array(
  695. 'organism_id' => $organism_id,
  696. 'uniquename' => $uname,
  697. 'type_id' => $cvterm->cvterm_id,
  698. );
  699. $options = array('statement_name' => 'sel_feature_oruqty');
  700. $results = chado_select_record('feature', array('feature_id'), $values, $options);
  701. if (count($results) > 0) {
  702. tripal_report_error('T_fasta_loader', "Cannot update the feature '%name' with a uniquename of '%uname' and type of '%type' as it
  703. conflicts with an existing feature with the same uniquename and type.",
  704. array('%name' => $name, '%uname' => $uname, '%type' => $type));
  705. return 0;
  706. }
  707. // the changes to the uniquename don't conflict so proceed with the update
  708. $values = array(
  709. 'uniquename' => $uname,
  710. 'residues' => $residues,
  711. 'seqlen' => drupal_strlen($residues),
  712. 'md5checksum' => md5($residues),
  713. 'is_analysis' => 'false',
  714. 'is_obsolete' => 'false',
  715. );
  716. $match = array(
  717. 'name' => $name,
  718. 'organism_id' => $organism_id,
  719. 'type_id' => $cvterm->cvterm_id,
  720. );
  721. $options = array('statement_name' => 'upd_feature_resemdisis_naorty_un');
  722. }
  723. // if we do not have a new unique name then don't change the existing uniquename field
  724. else {
  725. $values = array(
  726. 'residues' => $residues,
  727. 'seqlen' => drupal_strlen($residues),
  728. 'md5checksum' => md5($residues),
  729. 'is_analysis' => 'false',
  730. 'is_obsolete' => 'false',
  731. );
  732. $match = array(
  733. 'name' => $name,
  734. 'organism_id' => $organism_id,
  735. 'type_id' => $cvterm->cvterm_id,
  736. );
  737. $options = array('statement_name' => 'upd_feature_unresemdisis_naorty');
  738. }
  739. // perform the update
  740. $success = chado_update_record('feature', $match, $values, $options);
  741. if (!$success) {
  742. tripal_report_error('T_fasta_loader', TRIPAL_ERROR,
  743. "Failed to update feature '%name' ('%name')",
  744. array('%name' => $name, '%uiname' => $uname));
  745. return 0;
  746. }
  747. }
  748. if (strcmp($match_type, 'Unique name')==0) {
  749. // if we're matching on the uniquename but do not have a new name then we don't want to update the name.
  750. $values = array();
  751. if ($name) {
  752. $values = array(
  753. 'name' => $name,
  754. 'residues' => $residues,
  755. 'seqlen' => drupal_strlen($residues),
  756. 'md5checksum' => md5($residues),
  757. 'is_analysis' => 'false',
  758. 'is_obsolete' => 'false',
  759. );
  760. $match = array(
  761. 'uniquename' => $uname,
  762. 'organism_id' => $organism_id,
  763. 'type_id' => $cvterm->cvterm_id,
  764. );
  765. $options = array('statement_name' => 'upd_feature_resemdisis_unorty_na');
  766. }
  767. // if we do not have a new name then don't change the existing name field
  768. else {
  769. $values = array(
  770. 'residues' => $residues,
  771. 'seqlen' => drupal_strlen($residues),
  772. 'md5checksum' => md5($residues),
  773. 'is_analysis' => 'false',
  774. 'is_obsolete' => 'false',
  775. );
  776. $match = array(
  777. 'uniquename' => $uname,
  778. 'organism_id' => $organism_id,
  779. 'type_id' => $cvterm->cvterm_id,
  780. );
  781. $options = array('statement_name' => 'upd_feature_naresemdisis_unorty');
  782. }
  783. $success = chado_update_record('feature', $match, $values, $options);
  784. if (!$success) {
  785. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to update feature '%name' ('%name')",
  786. array('%name' => $name, '%uiname' => $uname));
  787. return 0;
  788. }
  789. }
  790. }
  791. // add in the analysis link
  792. if ($analysis_id) {
  793. // if the association doesn't already exist then add one
  794. $values = array(
  795. 'analysis_id' => $analysis_id,
  796. 'feature_id' => $feature->feature_id,
  797. );
  798. $sel_options = array('statement_name' => 'sel_analysisfeature_anfe');
  799. $results = chado_select_record('analysisfeature', array('analysisfeature_id'), $values, $sel_options);
  800. if (count($results) == 0) {
  801. $ins_options = array('statement_name' => 'ins_analysisfeature_anfe');
  802. $success = chado_insert_record('analysisfeature', $values, $ins_options);
  803. if (!$success) {
  804. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to associate analysis and feature '%name' ('%name')",
  805. array('%name' => $name, '%uname' => $uname));
  806. return 0;
  807. }
  808. }
  809. }
  810. // now add the database cross reference
  811. if ($db_id) {
  812. // check to see if this accession reference exists, if not add it
  813. $values = array(
  814. 'db_id' => $db_id,
  815. 'accession' => $accession
  816. );
  817. $sel_options = array('statement_name' => 'sel_dbxref_dbac');
  818. $results = chado_select_record('dbxref', array('dbxref_id'), $values, $sel_options);
  819. // if the accession doesn't exist then add it
  820. if (count($results) == 0) {
  821. $ins_options = array('statement_name' => 'ins_dbxref_dbac');
  822. $results = chado_insert_record('dbxref', $values, $ins_options);
  823. if (!$results) {
  824. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to add database accession '%accession'",
  825. array('%accession' => $accession));
  826. return 0;
  827. }
  828. $results = chado_select_record('dbxref', array('dbxref_id'), $values, $sel_options);
  829. if (count($results) == 1) {
  830. $dbxref = $results[0];
  831. }
  832. else {
  833. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to retreive newly inserted dbxref '%name (%uname)'",
  834. array('%name' => $name, '%uname' => $numane));
  835. return 0;
  836. }
  837. }
  838. else {
  839. $dbxref = $results[0];
  840. }
  841. // check to see if the feature dbxref record exists if not, then add it
  842. $values = array(
  843. 'feature_id' => $feature->feature_id,
  844. 'dbxref_id' => $dbxref->dbxref_id
  845. );
  846. $sel_options = array('statement_name' => 'sel_featuredbxref_fedb');
  847. $results = chado_select_record('feature_dbxref', array('feature_dbxref_id'), $values, $sel_options);
  848. if (count($results) == 0) {
  849. $ins_options = array('statement_name' => 'ins_featuredbxref_fedb');
  850. $success = chado_insert_record('feature_dbxref', $values, $ins_options);
  851. if (!$success) {
  852. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to add associate database accession '%accession' with feature",
  853. array('%accession' => $accession));
  854. return 0;
  855. }
  856. }
  857. }
  858. // if a relationship type was given, link the feature to its parent unless that relationship already exists
  859. if ($rel_type) {
  860. $values = array(
  861. 'organism_id' => $organism_id,
  862. 'uniquename' => $parent,
  863. 'type_id' => $parentcvterm->cvterm_id,
  864. );
  865. $options = array('statement_name' => 'sel_feature_oruqty');
  866. $results = chado_select_record('feature', array('feature_id'), $values, $options);
  867. if (count($results) != 1) {
  868. tripal_report_error('T_fasta_loader', "Cannot find a unique fature for the parent '%parent' of type
  869. '%type' for the feature.", array('%parent' => $parent, '%type' => $parent_type));
  870. return 0;
  871. }
  872. $parent_feature = $results[0];
  873. // check to see if the relationship already exists if not then add it
  874. $values = array(
  875. 'subject_id' => $feature->feature_id,
  876. 'object_id' => $parent_feature->feature_id,
  877. 'type_id' => $relcvterm->cvterm_id,
  878. );
  879. $sel_options = array('statement_name' => 'sel_featurerelationship_suojty');
  880. $results = chado_select_record('feature_relationship', array('feature_relationship_id'), $values, $sel_options);
  881. if (count($results) == 0) {
  882. $ins_options = array('statement_name' => 'ins_featurerelationship_suojty');
  883. $success = chado_insert_record('feature_relationship', $values, $ins_options);
  884. if (!$success) {
  885. tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to add associate database accession '%accession' with feature",
  886. array('%accession' => $accession));
  887. return 0;
  888. }
  889. }
  890. }
  891. }