fasta_loader.inc 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875
  1. <?php
  2. /**
  3. * @file
  4. * @todo Add file header description
  5. */
  6. /**
  7. * @defgroup fasta_loader FASTA Feature Loader
  8. * @{
  9. * Provides fasta loading functionality. Creates features based on their specification in a fasta file.
  10. * @}
  11. * @ingroup tripal_feature
  12. */
  /**
   * Form constructor for the FASTA feature loader.
   *
   * Collects the FASTA file path, the organism, the Sequence Ontology type of
   * the sequences, the insert/update method, the name matching mode, the
   * analysis, and the optional advanced settings (regular expressions for the
   * name, unique name, accession and parent, an external database and a
   * relationship type).
   *
   * The organism, analysis and external database lists are read from the Chado
   * database each time the form is built.
   *
   * @return
   *   The form array.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form() {
    $form = array();

    $form['fasta_file'] = array(
      '#type' => 'textfield',
      '#title' => t('FASTA File'),
      '#description' => t('Please enter the full system path for the FASTA file, or a path within the Drupal
        installation (e.g. /sites/default/files/xyz.obo). The path must be accessible to the
        server on which this Drupal instance is running.'),
      '#required' => TRUE,
    );

    // Get the list of organisms from Chado.
    $sql = "SELECT * FROM {organism} ORDER BY genus, species";
    $previous_db = tripal_db_set_active('chado'); // use chado database
    $org_rset = db_query($sql);
    tripal_db_set_active($previous_db); // now use drupal database
    $organisms = array();
    $organisms[''] = '';
    while ($organism = db_fetch_object($org_rset)) {
      $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
    }
    $form['organism_id'] = array(
      '#title' => t('Organism'),
      // '#type' is a machine name and must never be passed through t().
      '#type' => 'select',
      '#description' => t("Choose the organism to which these sequences are associated"),
      '#required' => TRUE,
      '#options' => $organisms,
    );

    $form['seqtype'] = array(
      '#type' => 'textfield',
      '#title' => t('Sequence Type'),
      '#required' => TRUE,
      '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the FASTA file.'),
    );

    // NOTE: the library selector is disabled, so no 'library_id' element is
    // added to the form and the library list is not queried.

    $form['method'] = array(
      '#type' => 'radios',
      '#title' => 'Method',
      '#required' => TRUE,
      // The options are keyed 0, 1 and 2; the handlers map these keys back to
      // the labels 'Insert only', 'Update only' and 'Insert and update'.
      '#options' => array(
        t('Insert only'),
        t('Update only'),
        t('Insert and update'),
      ),
      '#description' => t('Select how features in the FASTA file are handled.
        Select "Insert only" to insert the new features. If a feature already
        exists with the same name or unique name and type then it is skipped.
        Select "Update only" to only update featues that already exist in the
        database. Select "Insert and Update" to insert features that do
        not exist and upate those that do.'),
      '#default_value' => 2,
    );

    $form['match_type'] = array(
      '#type' => 'radios',
      '#title' => 'Name Match Type',
      '#required' => TRUE,
      // Keyed 0 ('Name') and 1 ('Unique name').
      '#options' => array(
        t('Name'),
        t('Unique name'),
      ),
      '#description' => t('Feature data is stored in Chado with both a human-readable
        name and a unique name. If the features in your FASTA file are identified using
        a human-readable name then select the "Name" button. If your features are
        identified using the unique name then select the "Unique name" button. If you
        loaded your features first using the GFF loader then the unique name of each
        features were indicated by the "ID=" attribute and the name by the "Name=" attribute.
        By default, the FASTA loader will use the first word (character string
        before the first space) as the name for your feature. If
        this does not uniquely identify your feature consider specifying a regular expression in the advanced section below.
        Additionally, you may import both a name and a unique name for each sequence using the advanced options.
        When updating a sequence, the value selected here will be used to identify the sequence in the
        database in combination with any regular expression provided below.'),
      '#default_value' => 1,
    );

    $form['analysis'] = array(
      '#type' => 'fieldset',
      '#title' => t('Analysis Used to Derive Features'),
      '#collapsed' => TRUE
    );
    $form['analysis']['desc'] = array(
      '#type' => 'markup',
      '#value' => t("Why specify an analysis for a data load? All data comes
        from some place, even if downloaded from Genbank. By specifying
        analysis details for all data uploads, it allows an end user to reproduce the
        data set, but at least indicates the source of the data."),
    );

    // Get the list of analyses from Chado.
    $sql = "SELECT * FROM {analysis} ORDER BY name";
    $previous_db = tripal_db_set_active('chado'); // use chado database
    $analysis_rset = db_query($sql);
    tripal_db_set_active($previous_db); // now use drupal database
    $analyses = array();
    $analyses[''] = '';
    while ($analysis = db_fetch_object($analysis_rset)) {
      $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
    }
    $form['analysis']['analysis_id'] = array(
      '#title' => t('Analysis'),
      '#type' => 'select',
      '#description' => t("Choose the analysis to which these features are associated"),
      '#required' => TRUE,
      '#options' => $analyses,
    );

    // Advanced Options
    $form['advanced'] = array(
      '#type' => 'fieldset',
      '#title' => t('Advanced Options'),
      '#collapsible' => TRUE,
      '#collapsed' => TRUE
    );
    $form['advanced']['re_help'] = array(
      '#type' => 'item',
      '#value' => t('A regular expression is an advanced method for extracting information from a string of text.
        Your FASTA file may contain both a human-readable name and a unique name for each sequence.
        If you want to import
        both the name and unique name for all sequences, then you must provide regular expressions
        so that the loader knows how to separate them.
        Otherwise the name and uniquename will be the same.
        By default, this loader will use the first word in the definition
        lines of the FASTA file
        as the name or unique name of the feature.'),
    );
    $form['advanced']['re_name'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature name from the FASTA definition line. For example, for a
        defintion line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the name would be, "^(.*?)\|.*$".'),
    );
    $form['advanced']['re_uname'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the unique name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the
        feature name from the FASTA definition line. For example, for a
        defintion line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
        the regular expression for the unique name would be "^.*?\|(.*)$").'),
    );

    // Advanced database cross-reference options
    $form['advanced']['db'] = array(
      '#type' => 'fieldset',
      '#title' => t('External Database Reference'),
      '#weight' => 6,
      '#collapsed' => TRUE
    );
    $form['advanced']['db']['re_accession'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the accession'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
      '#weight' => 2
    );

    // Get the list of external databases from Chado.
    $sql = "SELECT * FROM {db} ORDER BY name";
    $previous_db = tripal_db_set_active('chado'); // use chado database
    $db_rset = db_query($sql);
    tripal_db_set_active($previous_db); // now use drupal database
    $dbs = array();
    $dbs[''] = '';
    while ($db = db_fetch_object($db_rset)) {
      $dbs[$db->db_id] = "$db->name";
    }
    $form['advanced']['db']['db_id'] = array(
      '#title' => t('External Database'),
      '#type' => 'select',
      '#description' => t("Plese choose an external database for which these sequences have a cross reference."),
      '#required' => FALSE,
      '#options' => $dbs,
      '#weight' => 1,
    );

    // Advanced relationship options
    $form['advanced']['relationship'] = array(
      '#type' => 'fieldset',
      '#title' => t('Relationships'),
      '#weight' => 6,
      '#collapsed' => TRUE
    );
    // Keys are the relationship term names looked up by the loader; values
    // are the labels shown to the user.
    $rels = array();
    $rels[''] = '';
    $rels['part_of'] = 'part of';
    $rels['derives_from'] = 'produced by';
    $form['advanced']['relationship']['rel_type'] = array(
      '#title' => t('Relationship Type'),
      '#type' => 'select',
      '#description' => t("Use this option to create associations, or relationships between the
        features of this FASTA file and existing features in the database. For
        example, to associate a FASTA file of peptides to existing genes or transcript sequence,
        select the type 'produced by'. For a CDS sequences select the type 'part of'"),
      '#required' => FALSE,
      '#options' => $rels,
      '#weight' => 5,
    );
    $form['advanced']['relationship']['re_subject'] = array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the parent'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique
        name needed to identify the existing sequence for which the
        relationship type selected above will apply.'),
      '#weight' => 6
    );
    $form['advanced']['relationship']['parent_type'] = array(
      '#type' => 'textfield',
      '#title' => t('Parent Type'),
      '#required' => FALSE,
      '#description' => t('Please enter the Sequence Ontology term for the parent. For example
        if the FASTA file being loaded is a set of proteins that are
        products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
        this type must match the type for already loaded features.'),
      '#weight' => 7
    );

    $form['button'] = array(
      '#type' => 'submit',
      '#value' => t('Import FASTA file'),
      '#weight' => 10,
    );
    return $form;
  }
  /**
   * Form validation handler for tripal_feature_fasta_load_form().
   *
   * Checks that:
   * - a regular expression is supplied for the name kind selected in
   *   'match_type' when only the other one was given,
   * - the FASTA file exists, either relative to the Drupal installation or as
   *   an absolute path,
   * - the relationship fields and the external database fields are either all
   *   filled in or all empty,
   * - the sequence type (and the parent type when a relationship is selected)
   *   exist in the 'sequence' vocabulary.
   *
   * The resolved file path is stored in $form_state['storage']['dfile'] for the
   * submit handler.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state array, passed by reference.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_validate($form, &$form_state) {
    $values = $form_state['values'];

    $fasta_file   = trim($values['fasta_file']);
    $type         = trim($values['seqtype']);
    $re_name      = trim($values['re_name']);
    $re_uname     = trim($values['re_uname']);
    $re_accession = trim($values['re_accession']);
    $db_id        = $values['db_id'];
    $rel_type     = $values['rel_type'];
    $re_subject   = trim($values['re_subject']);
    $parent_type  = trim($values['parent_type']);

    // The radios submit the option key (0 or 1); translate it to its label
    // with a lookup instead of chained loose comparisons.
    $match_types = array(0 => 'Name', 1 => 'Unique name');
    $match_key   = (int) trim($values['match_type']);
    $match_type  = isset($match_types[$match_key]) ? $match_types[$match_key] : '';

    if ($re_name and !$re_uname and $match_type === 'Unique name') {
      form_set_error('re_uname', t("You must provide a regular expression to identify the sequence unique name"));
    }
    if (!$re_name and $re_uname and $match_type === 'Name') {
      form_set_error('re_name', t("You must provide a regular expression to identify the sequence name"));
    }

    // check to see if the file is located local to Drupal
    $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
    if (!file_exists($dfile)) {
      // if not local to Drupal, the file must be someplace else, just use
      // the full path provided
      $dfile = $fasta_file;
    }
    if (!file_exists($dfile)) {
      form_set_error('fasta_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
    }

    // make sure if a relationship is specified that all fields are provided.
    if (($rel_type or $parent_type) and !$re_subject) {
      form_set_error('re_subject', t("Please provide a regular expression for the parent"));
    }
    if (($rel_type or $re_subject) and !$parent_type) {
      form_set_error('parent_type', t("Please provide a SO term for the parent"));
    }
    if (($parent_type or $re_subject) and !$rel_type) {
      form_set_error('rel_type', t("Please select a relationship type"));
    }

    // make sure if a database is specified that all fields are provided
    if ($db_id and !$re_accession) {
      form_set_error('re_accession', t("Please provide a regular expression for the accession"));
    }
    if ($re_accession and !$db_id) {
      form_set_error('db_id', t("Please select a database"));
    }

    // check to make sure the types exists. The cvterm tables live in Chado, so
    // the lookup goes through chado_query() as the loader itself does.
    $cvtermsql = "SELECT CVT.cvterm_id
                  FROM {cvterm} CVT
                    INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                    LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                  WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
    $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $type, $type));
    if (!$cvterm) {
      // The element is named 'seqtype'; flagging 'type' would not highlight it.
      form_set_error('seqtype', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
    }
    if ($rel_type) {
      $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $parent_type, $parent_type));
      if (!$cvterm) {
        form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
      }
    }

    // @todo check to make sure the 'relationship' and 'sequence' ontologies are loaded

    // Hand the resolved path over to the submit handler.
    $form_state['storage']['dfile'] = $dfile;
  }
  /**
   * Form submission handler for tripal_feature_fasta_load_form().
   *
   * Converts the numeric 'method' and 'match_type' radio keys to their labels
   * and registers a Tripal job that calls tripal_feature_load_fasta() with the
   * submitted settings and the file path stored by the validation handler.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state array, passed by reference.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_fasta_load_form_submit($form, &$form_state) {
    global $user;

    $values = $form_state['values'];

    $dfile        = $form_state['storage']['dfile'];
    $organism_id  = $values['organism_id'];
    $type         = trim($values['seqtype']);
    $method       = trim($values['method']);
    $match_type   = trim($values['match_type']);
    // The library selector is not part of the form, so the key may be absent;
    // the job argument is NULL in that case.
    $library_id   = isset($values['library_id']) ? $values['library_id'] : NULL;
    $re_name      = trim($values['re_name']);
    $re_uname     = trim($values['re_uname']);
    $re_accession = trim($values['re_accession']);
    $db_id        = $values['db_id'];
    $rel_type     = $values['rel_type'];
    $re_subject   = trim($values['re_subject']);
    $parent_type  = trim($values['parent_type']);
    $analysis_id  = $values['analysis_id'];

    // The radios submit the option key; map it to the label the loader
    // compares against. Unknown values are passed through untouched.
    $methods = array(
      '0' => 'Insert only',
      '1' => 'Update only',
      '2' => 'Insert and update',
    );
    if (isset($methods[$method])) {
      $method = $methods[$method];
    }
    $match_types = array(
      '0' => 'Name',
      '1' => 'Unique name',
    );
    if (isset($match_types[$match_type])) {
      $match_type = $match_types[$match_type];
    }

    // The argument order must match the signature of tripal_feature_load_fasta().
    $args = array($dfile, $organism_id, $type, $library_id, $re_name, $re_uname,
      $re_accession, $db_id, $rel_type, $re_subject, $parent_type, $method,
      $user->uid, $analysis_id, $match_type);

    // Use only the file name (the part after the last '/') in the job title.
    $fname = preg_replace("/.*\/(.*)/", "$1", $dfile);
    tripal_add_job("Import FASTA file: $fname", 'tripal_feature',
      'tripal_feature_load_fasta', $args, $user->uid);
  }
  /**
   * Loads the sequences of a FASTA file as features.
   *
   * The file is read line by line. Each definition line (starting with '>')
   * yields the name, unique name, accession and relationship subject of a
   * record; the following lines are concatenated into its residues. Every
   * completed record is handed to tripal_feature_fasta_loader_handle_feature().
   *
   * The user supplied regular expressions are wrapped in '/' delimiters, so a
   * literal '/' inside an expression must be escaped by the user. The first
   * capture group of each expression is the extracted value.
   *
   * @param $dfile
   *   Path of the FASTA file.
   * @param $organism_id
   *   The organism_id the features belong to.
   * @param $type
   *   Name (or synonym) of the 'sequence' vocabulary term for the features.
   * @param $library_id
   *   Not used by this function.
   * @param $re_name
   *   Regular expression (without delimiters) extracting the feature name.
   * @param $re_uname
   *   Regular expression (without delimiters) extracting the unique name.
   * @param $re_accession
   *   Regular expression (without delimiters) extracting the accession.
   * @param $db_id
   *   The db_id of the external database for the accession.
   * @param $rel_type
   *   Name of the 'relationship' vocabulary term, or empty for none.
   * @param $re_subject
   *   Regular expression (without delimiters) extracting the parent name.
   * @param $parent_type
   *   Name of the 'sequence' vocabulary term of the parent, or empty for none.
   * @param $method
   *   'Insert only', 'Update only' or 'Insert and update'.
   * @param $uid
   *   Not used by this function.
   * @param $analysis_id
   *   The analysis_id to associate the features with.
   * @param $match_type
   *   'Name' or 'Unique name'.
   * @param $job
   *   (optional) Job identifier passed to tripal_job_set_progress(); progress
   *   is only reported when it is set.
   *
   * @return
   *   0 when a vocabulary term cannot be found or the file cannot be opened;
   *   nothing otherwise.
   *
   * @ingroup fasta_loader
   */
  function tripal_feature_load_fasta($dfile, $organism_id, $type,
    $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
    $re_subject, $parent_type, $method, $uid, $analysis_id,
    $match_type, $job = NULL) {

    // begin the transaction
    $connection = tripal_db_start_transaction();

    // if we cannot get a connection then let the user know the loading will be slow
    if (!$connection) {
      print "A persistant connection was not obtained. Loading will be slow\n";
    }
    else {
      print "\nNOTE: Loading of this FASTA file is performed using a database transaction. \n" .
        "If the load fails or is terminated prematurely then the entire set of \n" .
        "insertions/updates is rolled back and will not be found in the database\n\n";
    }

    // first get the type for this sequence
    $cvtermsql = "SELECT CVT.cvterm_id
                  FROM {cvterm} CVT
                    INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                    LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                  WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
    $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $type, $type));
    if (!$cvterm) {
      watchdog("T_fasta_loader", "Cannot find the term type: '%type'", array('%type' => $type), WATCHDOG_ERROR);
      return 0;
    }

    // All cvterm lookups go through chado_query() so they hit the Chado tables.
    $parentcvterm = NULL;
    if ($parent_type) {
      $parentcvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $parent_type, $parent_type));
      if (!$parentcvterm) {
        // Report the term that was searched for, not the (empty) result.
        watchdog("T_fasta_loader", "Cannot find the paretne term type: '%type'", array('%type' => $parent_type), WATCHDOG_ERROR);
        return 0;
      }
    }
    $relcvterm = NULL;
    if ($rel_type) {
      $relcvterm = db_fetch_object(chado_query($cvtermsql, 'relationship', $rel_type, $rel_type));
      if (!$relcvterm) {
        watchdog("T_fasta_loader", "Cannot find the relationship term type: '%type'", array('%type' => $rel_type), WATCHDOG_ERROR);
        return 0;
      }
    }

    print "Opening FASTA file $dfile\n";
    $fh = fopen($dfile, 'r');
    if (!$fh) {
      watchdog('T_fasta_loader', "cannot open file: %dfile", array('%dfile' => $dfile), WATCHDOG_ERROR);
      return 0;
    }
    $filesize = filesize($dfile);

    $i = 0;            // number of lines read
    $name = '';
    $uname = '';
    $residues = '';
    $accession = '';
    $subject = '';
    // No source is determined by this function; it is passed on as NULL.
    $source = NULL;

    // Report progress roughly every 1% of the file.
    $interval = intval($filesize * 0.01);
    if ($interval < 1) {
      $interval = 1;
    }
    $num_read = 0;     // bytes read so far
    $intv_read = 0;    // bytes read since the last progress report

    // Compare against FALSE so that a line that evaluates to false (e.g. a
    // final "0" without a newline) does not end the loop early.
    while (($line = fgets($fh)) !== FALSE) {
      $i++;
      // filesize() is in bytes, so count bytes rather than characters.
      $line_bytes = strlen($line);
      $num_read += $line_bytes;
      $intv_read += $line_bytes;

      // if we encounter a definition line then get the name, uniquename,
      // accession and relationship subject from the definition line
      if (preg_match('/^>/', $line)) {

        // if we have a feature name then we are starting a new sequence
        // so let's handle the previous one before moving on
        if ($name or $uname) {
          tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
            $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
            $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
        }
        // Always start the new record clean. Otherwise the residues of a
        // record whose name could not be extracted would be prepended to the
        // next feature.
        $residues = '';
        $name = '';
        $uname = '';
        $accession = '';
        $subject = '';

        $line = preg_replace("/^>/", '', $line);

        // get the feature name
        if ($re_name) {
          $matches = array();
          if (!preg_match("/$re_name/", $line, $matches)) {
            print "WARNING: Regular expression for the feature name finds nothing\n";
          }
          $name = isset($matches[1]) ? trim($matches[1]) : '';
        }
        elseif (strcmp($match_type, 'Name') == 0) {
          // if the match_type is name and no regular expression was provided
          // then use the first word as the name, otherwise we don't set the name
          $matches = array();
          preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches);
          $name = isset($matches[1]) ? trim($matches[1]) : '';
        }

        // get the feature unique name
        if ($re_uname) {
          $matches = array();
          if (!preg_match("/$re_uname/", $line, $matches)) {
            print "WARNING: Regular expression for the feature unique name finds nothing\n";
          }
          $uname = isset($matches[1]) ? trim($matches[1]) : '';
        }
        elseif (strcmp($match_type, 'Unique name') == 0) {
          // if the match_type is unique name and no regular expression was
          // provided then use the first word as the unique name, otherwise
          // we don't set the uniquename
          $matches = array();
          preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches);
          $uname = isset($matches[1]) ? trim($matches[1]) : '';
        }

        // get the accession (only when an expression was supplied)
        if ($re_accession) {
          $matches = array();
          preg_match("/$re_accession/", $line, $matches);
          $accession = isset($matches[1]) ? trim($matches[1]) : '';
        }

        // get the relationship subject (only when an expression was supplied)
        if ($re_subject) {
          $matches = array();
          preg_match("/$re_subject/", $line, $matches);
          $subject = isset($matches[1]) ? trim($matches[1]) : '';
        }
      }
      else {
        $residues .= trim($line);

        // update the job status every time another interval has been read
        if ($job and $intv_read >= $interval) {
          $intv_read = 0;
          $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
          $label = $name ? $name : $uname;
          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Parsing: $label\r";
          tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
        }
      }
    }
    fclose($fh);

    // now load the last sequence in the file, if there is one
    if ($name or $uname) {
      tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
        $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
        $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
    }

    // commit the transaction
    tripal_db_commit_transaction();
    print "Done\n";
  }
  528. /**
  529. *
  530. *
  531. * @ingroup fasta_loader
  532. */
  533. function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $accession,
  534. $parent, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
  535. $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm) {
  536. // check to see if this feature already exists if the match_type is 'Name'
  537. if (strcmp($match_type, 'Name')==0) {
  538. $values = array(
  539. 'organism_id' => $organism_id,
  540. 'name' => $name,
  541. 'type_id' => $cvterm->cvterm_id,
  542. );
  543. $options = array('statement_name' => 'sel_feature_ornaty');
  544. $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
  545. if (count($results) > 1) {
  546. watchdog('T_fasta_loader', "Multiple features exist with the name '%name' of type
  547. '%type' for the organism. skipping", array('%name' => $name, '%type' => $type));
  548. return 0;
  549. }
  550. if (count($results) == 1) {
  551. $reature = $results[0];
  552. }
  553. }
  554. // check to see if this feature already exists if the match_type is 'Unique Name'
  555. if (strcmp($match_type, 'Unique name')==0) {
  556. $values = array(
  557. 'organism_id' => $organism_id,
  558. 'uniquename' => $uname,
  559. 'type_id' => $cvterm->cvterm_id,
  560. );
  561. $options = array('statement_name' => 'sel_feature_oruqty');
  562. $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
  563. if (count($results) > 1) {
  564. watchdog('T_fasta_loader', "Multiple features exist with the name '%name' of type
  565. '%type' for the organism. skipping", array('%name' => $name, '%type' => $type));
  566. return 0;
  567. }
  568. if (count($results) == 1) {
  569. $reature = $results[0];
  570. }
  571. }
  572. // if we don't have a feature and we're doing an insert then do the insert
  573. $inserted = 0;
  574. if (!$feature and (strcmp($method, 'Insert only')==0 or strcmp($method, 'Insert and update')==0)) {
  575. // if we have a unique name but not a name then set them to be the same
  576. // and vice versa
  577. if (!$uname) {
  578. $uname = $name;
  579. }
  580. elseif (!$name) {
  581. $name = $uname;
  582. }
  583. // insert the feature
  584. $values = array(
  585. 'organism_id' => $organism_id,
  586. 'name' => $name,
  587. 'uniquename' => $uname,
  588. 'residues' => $residues,
  589. 'seqlen' => drupal_strlen($residues),
  590. 'md5checksum' => md5($residues),
  591. 'type_id' => $cvterm->cvterm_id,
  592. 'is_analysis' => 'FALSE',
  593. 'is_obsolete' => 'FALSE',
  594. );
  595. $options = array('statement_name' => 'ins_feature_all');
  596. $success = tripal_core_chado_insert('feature', $values, $options);
  597. if (!$success) {
  598. watchdog('T_fasta_loader', "Failed to insert feature '%name (%uname)'",
  599. array('%name' => $name, '%uname' => $numane), WATCHDOG_ERROR);
  600. return 0;
  601. }
  602. // now get the feature we just inserted
  603. $values = array(
  604. 'organism_id' => $organism_id,
  605. 'uniquename' => $uname,
  606. 'type_id' => $cvterm->cvterm_id,
  607. );
  608. $options = array('statement_name' => 'sel_feature_oruqty');
  609. $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
  610. if (count($results) == 1) {
  611. $inserted = 1;
  612. $feature = $results[0];
  613. }
  614. else {
  615. watchdog('T_fasta_loader', "Failed to retreive newly inserted feature '%name (%uname)'",
  616. array('%name' => $name, '%uname' => $numane), WATCHDOG_ERROR);
  617. return 0;
  618. }
  619. }
  620. // if we don't have a feature and the uesr wants to do an update then fail
  621. if (!$feature and (strcmp($method, 'Update only')==0 or drupal_strcmp($method, 'Insert and update')==0)) {
  622. watchdog('T_fasta_loader', "Failed to find feature '%name' ('%name') while matching on " .
  623. drupal_strtolower($match_type), array('%name' => $name, '%uiname' => $uname), WATCHDOG_ERROR);
  624. return 0;
  625. }
  626. // if we do have a feature and this is an update then proceed with the update
  627. if ($feature and !$inserted and (strcmp($method, 'Update only')==0 or strcmp($method, 'Insert and update')==0)) {
  628. if (strcmp($match_type, 'Name')==0) {
  629. // if we're matching on the name but do not have a new unique name then we don't want to update the uniquename.
  630. $values = array();
  631. if ($uname) {
  632. $values = array(
  633. 'residues' => $residues,
  634. 'seqlen' => drupal_strlen($residues),
  635. 'md5checksum' => md5($residues),
  636. 'is_analysis' => 'false',
  637. 'is_obsolete' => 'false',
  638. );
  639. $match = array(
  640. 'organism_id' => $organism_id,
  641. 'name' => $name,
  642. 'type_id' => $cvterm->cvterm_id,
  643. );
  644. $options = array('statement_name' => 'upd_feature_resemdisis_ornaty');
  645. }
  646. // if we have a unique name then update it after matching by the name
  647. else {
  648. $values = array(
  649. 'uniquename' => $uname,
  650. 'residues' => $residues,
  651. 'seqlen' => drupal_strlen($residues),
  652. 'md5checksum' => md5($residues),
  653. 'is_analysis' => 'false',
  654. 'is_obsolete' => 'false',
  655. );
  656. $match = array(
  657. 'name' => $name,
  658. 'organism_id' => $organism_id,
  659. 'type_id' => $cvterm->cvterm_id,
  660. );
  661. $options = array('statement_name' => 'upd_feature_unresemdisis_naorty');
  662. }
  663. $success = tripal_core_chado_update('feature', $match, $values, $options);
  664. if (!$success) {
  665. watchdog('T_fasta_loader', "Failed to update feature '%name' ('%name')",
  666. array('%name' => $name, '%uiname' => $uname), WATCHDOG_ERROR);
  667. return 0;
  668. }
  669. }
  670. else {
  671. // if we're matching on the uniquename but do not have a new name then we don't want to update the name.
  672. $values = array();
  673. if ($name) {
  674. $values = array(
  675. 'residues' => $residues,
  676. 'seqlen' => drupal_strlen($residues),
  677. 'md5checksum' => md5($residues),
  678. 'is_analysis' => 'false',
  679. 'is_obsolete' => 'false',
  680. );
  681. $match = array(
  682. 'organism_id' => $organism_id,
  683. 'uniquename' => $uname,
  684. 'type_id' => $cvterm->cvterm_id,
  685. );
  686. $options = array('statement_name' => 'upd_feature_resemdisis_orunty');
  687. }
  688. // if we have a unique name then update it after matching by the name
  689. else {
  690. $values = array(
  691. 'name' => $name,
  692. 'residues' => $residues,
  693. 'seqlen' => drupal_strlen($residues),
  694. 'md5checksum' => md5($residues),
  695. 'is_analysis' => 'false',
  696. 'is_obsolete' => 'false',
  697. );
  698. $match = array(
  699. 'uniquename' => $uname,
  700. 'organism_id' => $organism_id,
  701. 'type_id' => $cvterm->cvterm_id,
  702. );
  703. $options = array('statement_name' => 'upd_feature_naresemdisis_unorty');
  704. }
  705. $success = tripal_core_chado_update('feature', $match, $values, $options);
  706. if (!$success) {
  707. watchdog('T_fasta_loader', "Failed to update feature '%name' ('%name')",
  708. array('%name' => $name, '%uiname' => $uname), WATCHDOG_ERROR);
  709. return 0;
  710. }
  711. }
  712. }
  713. // add in the analysis link
  714. if ($analysis_id) {
  715. // if the association doens't alredy exist then add one
  716. $values = array(
  717. 'analysis_id' => $analysis_id,
  718. 'feature_id' => $feature->feature_id,
  719. );
  720. $sel_options = array('statement_name' => 'sel_analysisfeature_anfe');
  721. $results = tripal_core_chado_select('analysisfeature', array('analysisfeature_id'), $values, $sel_options);
  722. if (count($results) == 0) {
  723. $ins_options = array('statement_name' => 'ins_analysisfeature_anfe');
  724. $success = tripal_core_chado_insert('analysisfeature', $values, $ins_options);
  725. if (!$success) {
  726. watchdog('T_fasta_loader', "Failed to associate analysis and feature '%name' ('%name')",
  727. array('%name' => $name, '%uname' => $uname), WATCHDOG_ERROR);
  728. return 0;
  729. }
  730. }
  731. }
  732. // now add the database cross reference
  733. if ($db_id) {
  734. // check to see if this accession reference exists, if not add it
  735. $values = array(
  736. 'db_id' => $db_id,
  737. 'accession' => $accession
  738. );
  739. $sel_options = array('statement_name' => 'sel_dbxref_dbac');
  740. $results = tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $sel_options);
  741. if (count($results) == 0) {
  742. $ins_options = array('statement_name' => 'sel_dbxref_dbac');
  743. $success = tripal_core_chado_insert('dbxref', $values, $ins_options);
  744. if (!$success) {
  745. watchdog('T_fasta_loader', "Failed to add database accession '%accession'",
  746. array('%accession' => $accession), WATCHDOG_ERROR);
  747. return 0;
  748. }
  749. $results = tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $sel_options);
  750. if (count($results) == 1) {
  751. $dbxref = $results[0];
  752. }
  753. else {
  754. watchdog('T_fasta_loader', "Failed to retreive newly inserted dbxref '%name (%uname)'",
  755. array('%name' => $name, '%uname' => $numane), WATCHDOG_ERROR);
  756. return 0;
  757. }
  758. }
  759. // check to see if the feature dbxref record exists if not, then add it
  760. $values = array(
  761. 'feature_id' => $feature->feature_id,
  762. 'dbxref_id' => $dbxref->dbxref_id
  763. );
  764. $sel_options = array('statement_name' => 'sel_featuredbxref_fedb');
  765. $results = tripal_core_chado_select('feature_dbxref', array('feature_dbxref_id'), $values, $sel_options);
  766. if (count($results) == 0) {
  767. $ins_options = array('statement_name' => 'ins_featuredbxref_fedb');
  768. $success = tripal_core_chado_insert('feature_dbxref', $values, $ins_options);
  769. if (!$success) {
  770. watchdog('T_fasta_loader', "Failed to add associate database accession '%accession' with feature",
  771. array('%accession' => $accession), WATCHDOG_ERROR);
  772. return 0;
  773. }
  774. }
  775. }
  776. // now add in the relationship if one exists. If not, then add it
  777. if ($rel_type) {
  778. $values = array(
  779. 'organism_id' => $organism_id,
  780. 'uniquename' => $parent,
  781. 'type_id' => $parentcvterm->cvterm_id,
  782. );
  783. $options = array('statement_name' => 'sel_feature_oruqty');
  784. $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
  785. if (count($results) != 1) {
  786. watchdog('T_fasta_loader', "Cannot find a unique fature for the parent '%parent' of type
  787. '%type' for the feature.", array('%parent' => $parent, '%type' => $parent_type));
  788. return 0;
  789. }
  790. $parent_feature = $results[0];
  791. // check to see if the relationship already exists if not then add it
  792. $values = array(
  793. 'subject_id' => $feature->feature_id,
  794. 'ojbect_id' => $parent_feature->feature_id,
  795. 'type_id' => $relcvterm->cvterm_id,
  796. );
  797. $sel_options = array('statement_name' => 'sel_featurerelationship_suojty');
  798. $results = tripal_core_chado_select('feature_relationship', array('feature_relationships_id'), $values, $sel_options);
  799. if (count($results) == 0) {
  800. $ins_options = array('statement_name' => 'sel_featurerelationship_suojty');
  801. $success = tripal_core_chado_insert('feature_relationship', $values, $ins_options);
  802. if (!$success) {
  803. watchdog('T_fasta_loader', "Failed to add associate database accession '%accession' with feature",
  804. array('%accession' => $accession), WATCHDOG_ERROR);
  805. return 0;
  806. }
  807. }
  808. }
  809. }