tripal_feature.sync_features.inc 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. <?php
  2. /**
  3. * @file
  4. * @todo Add file header description
  5. */
  6. # This script can be run as a stand-alone script to sync all the features from chado to drupal
  7. // Parameter f specifies the feature_id to sync
  8. // -f 0 will sync all features
  9. $arguments = getopt("f:t:");
  10. if (isset($arguments['f']) and isset($arguments['t']) and $arguments['t'] == 'chado_feature') {
  11. $drupal_base_url = parse_url('http://www.example.com');
  12. $_SERVER['HTTP_HOST'] = $drupal_base_url['host'];
  13. $_SERVER['REQUEST_URI'] = $_SERVER['SCRIPT_NAME'] = $_SERVER['PHP_SELF'];
  14. $_SERVER['REMOTE_ADDR'] = NULL;
  15. $_SERVER['REQUEST_METHOD'] = NULL;
  16. require_once 'includes/bootstrap.inc';
  17. drupal_bootstrap(DRUPAL_BOOTSTRAP_FULL);
  18. $feature_id = $arguments['f'];
  19. if ($feature_id > 0) {
  20. tripal_feature_sync_feature($feature_id);
  21. }
  22. else {
  23. print "syncing all features...\n";
  24. tripal_feature_sync_features();
  25. }
  26. }
  27. /**
  28. *
  29. */
  30. function tripal_feature_sync_form() {
  31. $form['description'] = array(
  32. '#type' => 'item',
  33. '#value' => t("Add feature types, optionally select an organism and " .
  34. "click the 'Sync all Features' button to create Drupal " .
  35. "content for features in chado. Only features of the types listed " .
  36. "below in the Feature Types box will be synced. You may limit the " .
  37. "features to be synced by a specific organism. Depending on the " .
  38. "number of features in the chado database this may take a long " .
  39. "time to complete. "),
  40. );
  41. $form['feature_types'] = array(
  42. '#title' => t('Feature Types'),
  43. '#type' => 'textarea',
  44. '#description' => t("Enter the names of the feature types to sync. Pages for these feature " .
  45. "types will be created automatically for features that exist in the " .
  46. "chado database. The names listed here should be spearated by " .
  47. "spaces or entered separately on new lines. The names must match " .
  48. "exactly (spelling and case) with terms in the sequence ontology"),
  49. '#required' => TRUE,
  50. '#default_value' => variable_get('chado_sync_feature_types', 'gene contig'),
  51. );
  52. // get the list of organisms
  53. $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  54. $orgs = tripal_organism_get_synced();
  55. $organisms[] = '';
  56. foreach ($orgs as $organism) {
  57. $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  58. }
  59. $form['organism_id'] = array(
  60. '#title' => t('Organism'),
  61. '#type' => t('select'),
  62. '#description' => t("Choose the organism for which features types set above will be synced. Only organisms which also have been synced will appear in this list."),
  63. '#options' => $organisms,
  64. );
  65. $form['button'] = array(
  66. '#type' => 'submit',
  67. '#value' => t('Sync all Features'),
  68. '#weight' => 3,
  69. );
  70. return $form;
  71. }
  72. /**
  73. *
  74. */
  75. function tripal_feature_sync_form_validate($form, &$form_state) {
  76. $organism_id = $form_state['values']['organism_id'];
  77. $feature_types = $form_state['values']['feature_types'];
  78. // nothing to do
  79. }
  80. /**
  81. *
  82. */
  83. function tripal_feature_sync_form_submit($form, &$form_state) {
  84. global $user;
  85. $organism_id = $form_state['values']['organism_id'];
  86. $feature_types = $form_state['values']['feature_types'];
  87. $job_args = array(0, $organism_id, $feature_types);
  88. if ($organism_id) {
  89. $organism = tripal_core_chado_select('organism', array('genus', 'species'), array('organism_id' => $organism_id));
  90. $title = "Sync all features for " . $organism[0]->genus . " " . $organism[0]->species;
  91. }
  92. else {
  93. $title = t('Sync all features for all synced organisms');
  94. }
  95. variable_set('chado_sync_feature_types', $feature_types);
  96. tripal_add_job($title, 'tripal_feature', 'tripal_feature_sync_features', $job_args, $user->uid);
  97. }
  98. /**
  99. *
  100. * @param $na
  101. * Tripal expects all jobs to have at least one argument. For this function
  102. * we don't need any, so we have this dummy argument as a filler
  103. * @param $job_id
  104. */
  105. function tripal_feature_set_urls($na = NULL, $job = NULL) {
  106. // begin the transaction
  107. db_query("BEGIN");
  108. print "\nNOTE: Setting of URLs is performed using a database transaction. \n" .
  109. "If the load fails or is terminated prematurely then the entire set of \n" .
  110. "new URLs will be rolled back and no changes will be made\n\n";
  111. // get the number of records we need to set URLs for
  112. $csql = "SELECT count(*) FROM {chado_feature}";
  113. $num_nodes = db_result(db_query($csql));
  114. // calculate the interval at which we will print an update on the screen
  115. $num_set = 0;
  116. $num_per_interval = 100;
  117. // prepate the statements which will quickly add url alias. Because these
  118. // are not Chado tables we must manually prepare them
  119. $psql = "
  120. PREPARE del_url_alias_by_src (text) AS
  121. DELETE FROM {url_alias} WHERE src = \$1
  122. ";
  123. db_query($psql);
  124. $psql = "
  125. PREPARE ins_url_alias_nisrds (text, text) AS
  126. INSERT INTO url_alias (src, dst) VALUES (\$1, \$2)
  127. ";
  128. db_query($psql);
  129. // get the URL alias syntax string
  130. $url_alias = variable_get('chado_feature_url_string', '/feature/[genus]/[species]/[type]/[uniquename]');
  131. if (!$url_alias) {
  132. $url_alias = '/feature/[genus]/[species]/[type]/[uniquename]';
  133. }
  134. $url_alias = preg_replace('/^\//', '', $url_alias); // remove any preceeding forward slash
  135. // get the list of features that have been synced
  136. $sql = "SELECT * FROM {chado_feature}";
  137. $nodes = db_query($sql);
  138. while ($node = $nodes->fetchObject()) {
  139. // get the URL alias
  140. $src = "node/$node->nid";
  141. $dst = tripal_feature_get_feature_url($node, $url_alias);
  142. if (!$dst) {
  143. db_query('DEALLOCATE "del_url_alias_by_src"');
  144. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  145. db_query("ROLLBACK");
  146. return;
  147. }
  148. // if the src and dst is the same (the URL alias couldn't be set)
  149. // then skip to the next one. There's nothing we can do about this one.
  150. if ($src == $dst) {
  151. continue;
  152. }
  153. // remove any previous alias and then add the new one
  154. $success = db_query("EXECUTE del_url_alias_by_src(:src)", array(':src' => $src));
  155. if (!$success) {
  156. db_query('DEALLOCATE "del_url_alias_by_src"');
  157. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  158. db_query("ROLLBACK");
  159. watchdog('trp-seturl', "Failed Removing URL Alias: %src", array('%src' => $src), WATCHDOG_ERROR);
  160. return;
  161. }
  162. $success = db_query("EXECUTE ins_url_alias_nisrds(:src, :dst)", array(':src' => $src, ':dst' => $dst));
  163. if (!$success) {
  164. db_query('DEALLOCATE "del_url_alias_by_src"');
  165. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  166. db_query("ROLLBACK");
  167. watchdog('trp-seturl', "Failed Adding URL Alias: %dst", array('%dst' => $dst), WATCHDOG_ERROR);
  168. return;
  169. }
  170. // update the job status every 1% features
  171. if ($job and $num_set % $num_per_interval == 0) {
  172. $percent = ($num_set / $num_nodes) * 100;
  173. tripal_job_set_progress($job, intval($percent));
  174. $percent = sprintf("%.2f", $percent);
  175. print "Setting URLs (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
  176. }
  177. $num_set++;
  178. }
  179. $percent = ($num_set / $num_nodes) * 100;
  180. tripal_job_set_progress($job, intval($percent));
  181. $percent = sprintf("%.2f", $percent);
  182. print "Setting URLs (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
  183. print "\nDone. Set " . number_format($num_set) . " URLs\n";
  184. // unprepare the statements
  185. db_query('DEALLOCATE "del_url_alias_by_src"');
  186. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  187. db_query("COMMIT");
  188. }
  189. /**
  190. *
  191. * @param $node
  192. * A node object containing at least the feature_id and nid
  193. * @param $url_alias
  194. * Optional. This should be the URL alias syntax string that contains
  195. * placeholders such as [id], [genus], [species], [name], [uniquename],
  196. * and [type]. These placeholders will be substituted for actual values.
  197. * If this parameter is not provided then the value of the
  198. * chado_feature_url_string Drupal variable will be used.
  199. */
  200. function tripal_feature_get_feature_url($node, $url_alias = NULL) {
  201. // get the starting URL alias
  202. if (!$url_alias) {
  203. $url_alias = variable_get('chado_feature_url_string', '/feature/[genus]/[species]/[type]/[uniquename]');
  204. if (!$url_alias) {
  205. $url_alias = '/feature/[genus]/[species]/[type]/[uniquename]';
  206. }
  207. $url_alias = preg_replace('/^\//', '', $url_alias); // remove any preceeding forward slash
  208. }
  209. // get the feature
  210. $values = array('feature_id' => $node->feature_id);
  211. $options = array('statement_name' => 'sel_feature_id');
  212. $feature = tripal_core_chado_select('feature', array('*'), $values, $options);
  213. if (!$feature) {
  214. watchdog('trp-seturl', "Cannot find feature when setting URL alias for feature: %id", array('%id' => $node->feature_id), WATCHDOG_ERROR);
  215. return FALSE;
  216. }
  217. $feature = (object) $feature[0];
  218. // get the organism
  219. $values = array('organism_id' => $feature->organism_id);
  220. $options = array('statement_name' => 'sel_organism_id');
  221. $organism = tripal_core_chado_select('organism', array('*'), $values, $options);
  222. if (!$organism) {
  223. watchdog('trp-seturl', "Cannot find organism when setting URL alias for feature: %id", array('%id' => $node->feature_id), WATCHDOG_ERROR);
  224. return FALSE;
  225. }
  226. $genus = preg_replace('/\s/', '_', strtolower($organism[0]->genus));
  227. $species = preg_replace('/\s/', '_', strtolower($organism[0]->species));
  228. // get the type
  229. $values = array('cvterm_id' => $feature->type_id);
  230. $options = array('statement_name' => 'sel_cvterm_id');
  231. $cvterm = tripal_core_chado_select('cvterm', array('name'), $values, $options);
  232. if (!$cvterm) {
  233. watchdog('trp-seturl', "Cannot find type when setting URL alias for feature: %id", array('%id' => $node->feature_id), WATCHDOG_ERROR);
  234. return FALSE;
  235. }
  236. $type = preg_replace('/\s/', '_', $cvterm[0]->name);
  237. // now substitute in the values
  238. $url_alias = preg_replace('/\[id\]/', $feature->feature_id, $url_alias);
  239. $url_alias = preg_replace('/\[genus\]/', $genus, $url_alias);
  240. $url_alias = preg_replace('/\[species\]/', $species, $url_alias);
  241. $url_alias = preg_replace('/\[type\]/', $type, $url_alias);
  242. $url_alias = preg_replace('/\[name\]/', $feature->name, $url_alias);
  243. $url_alias = preg_replace('/\[uniquename\]/', $feature->uniquename, $url_alias);
  244. // the dst field of the url_alias table is only 128 characters long.
  245. // if this is the case then simply return the node URL, we can't set this one
  246. if (strlen($url_alias) > 128) {
  247. watchdog('trp-seturl', "Cannot set alias longer than 128 characters: %alias.", array('%alias' => $url_alias), WATCHDOG_ERROR);
  248. return "node/" . $node->nid;
  249. }
  250. return $url_alias;
  251. }
  252. /**
  253. *
  254. *
  255. * @ingroup tripal_feature
  256. */
  257. function tripal_feature_sync_features($max_sync = 0, $organism_id = NULL,
  258. $feature_types = NULL, $job_id = NULL) {
  259. $i = 0;
  260. // get the list of available sequence ontology terms for which
  261. // we will build drupal pages from features in chado. If a feature
  262. // is not one of the specified typse we won't build a node for it.
  263. if (!$feature_types) {
  264. $allowed_types = variable_get('chado_sync_feature_types', 'gene contig');
  265. }
  266. else {
  267. $allowed_types = $feature_types;
  268. }
  269. $allowed_types = preg_replace("/[\s\n\r]+/", " ", $allowed_types);
  270. print "Looking for features of type: $allowed_types\n";
  271. # TODO: fix the hard-coding of variables in this SQL statement
  272. $so_terms = split(' ', $allowed_types);
  273. $where_cvt = "";
  274. foreach ($so_terms as $term) {
  275. $where_cvt .= "CVT.name = '$term' OR ";
  276. }
  277. $where_cvt = drupal_substr($where_cvt, 0, drupal_strlen($where_cvt)-3); # strip trailing 'OR'
  278. // get the list of organisms that are synced and only include features from
  279. // those organisms
  280. $orgs = tripal_organism_get_synced();
  281. $where_org = "";
  282. foreach ($orgs as $org) {
  283. if ($organism_id) {
  284. if ($org->organism_id and $org->organism_id == $organism_id) {
  285. $where_org .= "F.organism_id = $org->organism_id OR ";
  286. }
  287. }
  288. else {
  289. if ($org->organism_id) {
  290. $where_org .= "F.organism_id = $org->organism_id OR ";
  291. }
  292. }
  293. }
  294. $where_org = drupal_substr($where_org, 0, drupal_strlen($where_org)-3); # strip trailing 'OR'
  295. // use this SQL statement to get the features that we're going to upload
  296. $sql = "
  297. SELECT feature_id
  298. FROM {FEATURE} F
  299. INNER JOIN {Cvterm} CVT ON F.type_id = CVT.cvterm_id
  300. INNER JOIN {CV} on CV.cv_id = CVT.cv_id
  301. WHERE ($where_cvt) AND ($where_org) AND CV.name = 'sequence'
  302. ORDER BY feature_id";
  303. // get the list of features
  304. $results = chado_query($sql);
  305. // load into ids array
  306. $count = 0;
  307. $ids = array();
  308. while ($id = $results->fetchObject()) {
  309. $ids[$count] = $id->feature_id;
  310. $count++;
  311. }
  312. // make sure our vocabularies are set before proceeding
  313. tripal_feature_set_vocabulary();
  314. // pre-create the SQL statement that will be used to check
  315. // if a feature has already been synced. We skip features
  316. // that have been synced
  317. $sql = "SELECT * FROM {chado_feature} WHERE feature_id = :feature_id";
  318. // Iterate through features that need to be synced
  319. $interval = intval($count * 0.01);
  320. if ($interval < 1) {
  321. $interval = 1;
  322. }
  323. $num_ids = sizeof($ids);
  324. $i = 0;
  325. foreach ($ids as $feature_id) {
  326. // update the job status every 1% features
  327. if ($job_id and $i % $interval == 0) {
  328. tripal_job_set_progress($job_id, intval(($i/$count)*100));
  329. }
  330. // if we have a maximum number to sync then stop when we get there
  331. // if not then just continue on
  332. if ($max_sync and $i == $max_sync) {
  333. return '';
  334. }
  335. if (!db_query($sql, array(':feature_id' => $feature_id))->fetchObject()) {
  336. # parsing all the features can cause memory overruns
  337. # we are not sure why PHP does not clean up the memory as it goes
  338. # to avoid this problem we will call this script through an
  339. # independent system call
  340. print "$i of $num_ids Syncing feature id: $feature_id\n";
  341. $cmd = "php " . drupal_get_path('module', 'tripal_feature') . "/includes/tripal_feature.sync_features.inc -f $feature_id -t chado_feature";
  342. system($cmd);
  343. }
  344. $i++;
  345. }
  346. return '';
  347. }
  348. /**
  349. *
  350. *
  351. * @ingroup tripal_feature
  352. */
  353. function tripal_feature_sync_feature($feature_id) {
  354. //print "\tSyncing feature $feature_id\n";
  355. global $user;
  356. $create_node = 1; // set to 0 if the node exists and we just sync and not create
  357. // get the accession prefix
  358. $aprefix = variable_get('chado_feature_accession_prefix', 'FID');
  359. // if we don't have a feature_id then return
  360. if (!$feature_id) {
  361. drupal_set_message(t("Please provide a feature_id to sync"));
  362. return '';
  363. }
  364. // get information about this feature
  365. $fsql = "
  366. SELECT F.feature_id, F.name, F.uniquename,O.genus,
  367. O.species,CVT.name as cvname,F.residues,F.organism_id
  368. FROM {FEATURE} F
  369. INNER JOIN {Cvterm} CVT ON F.type_id = CVT.cvterm_id
  370. INNER JOIN {Organism} O ON F.organism_id = O.organism_ID
  371. WHERE F.feature_id = :feature_id
  372. ";
  373. $feature = chado_query($fsql, array(':feature_id' => $feature_id))->fetchObject();
  374. // get the synonyms for this feature
  375. $synsql = "
  376. SELECT S.name
  377. FROM {feature_synonym} FS
  378. INNER JOIN {synonym} S on FS.synonym_id = S.synonym_id
  379. WHERE FS.feature_id = :feature_id
  380. ";
  381. $synonyms = chado_query($synsql, array(':feature_id' => $feature_id));
  382. // now add these synonyms to the feature object as a single string
  383. $synstring = '';
  384. while ($synonym = $synonyms->fetchObject()) {
  385. $synstring .= "$synonym->name\n";
  386. }
  387. $feature->synonyms = $synstring;
  388. // check to make sure that we don't have any nodes with this feature name as a title
  389. // but without a corresponding entry in the chado_feature table if so then we want to
  390. // clean up that node. (If a node is found we don't know if it belongs to our feature or
  391. // not since features can have the same name/title.)
  392. $tsql = "SELECT * FROM {node} N WHERE title = :title";
  393. $cnsql = "SELECT * FROM {chado_feature} WHERE nid = :nid";
  394. $nodes = db_query($tsql, array(':title' => $feature->name));
  395. // cycle through all nodes that may have this title
  396. while ($node = $nodes->fetchObject()) {
  397. $feature_nid = db_query($cnsql, array(':nid' => $node->nid))->fetchObject();
  398. if (!$feature_nid) {
  399. drupal_set_message(t("%feature_id: A node is present but the chado_feature entry is missing... correcting", array('%feature_id' => $feature_id)));
  400. node_delete($node->nid);
  401. }
  402. }
  403. // check if this feature already exists in the chado_feature table.
  404. // if we have a chado feature, we want to check to see if we have a node
  405. $cfsql = "SELECT * FROM {chado_feature} WHERE feature_id = :feature_id";
  406. // @coder-ignore: don't need to use db_rewrite_sql() since need all nodes regardless of access control
  407. $nsql = "SELECT * FROM {node} N WHERE nid = :nid";
  408. $chado_feature = db_query($cfsql, array(':feature_id' => $feature->feature_id))->fetchObject();
  409. if ($chado_feature) {
  410. drupal_set_message(t("%feature_id: A chado_feature entry exists", array('%feature_id' => $feature_id)));
  411. $node = db_query($nsql, array(':nid' => $chado_feature->nid))->fetchObject();
  412. if (!$node) {
  413. // if we have a chado_feature but not a node then we have a problem and
  414. // need to cleanup
  415. drupal_set_message(t("%feature_id: The node is missing, but has a chado_feature entry... correcting", array('%feature_id' => $feature_id)));
  416. $df_sql = "DELETE FROM {chado_feature} WHERE feature_id = :feature_id";
  417. db_query($df_sql, array(':feature_id' => $feature_id));
  418. }
  419. else {
  420. drupal_set_message(t("%feature_id: A corresponding node exists", array('%feature_id' => $feature_id)));
  421. $create_node = 0;
  422. }
  423. }
  424. // if we've encountered an error then just return.
  425. if ($error_msg = db_error()) {
  426. //print "$error_msg\n";
  427. return '';
  428. }
  429. // if a drupal node does not exist for this feature then we want to
  430. // create one. Note that the node_save call in this block
  431. // will call the hook_submit function which
  432. if ($create_node) {
  433. // get the organism for this feature
  434. $sql = "SELECT * FROM {organism} WHERE organism_id = :organism_id";
  435. $organism = chado_query($sql, array(':organism_id' => $feature->organism_id))->fetchObject();
  436. drupal_set_message(t("%feature_id: Creating node $feature->name", array('%feature_id' => $feature_id)));
  437. $new_node = new stdClass();
  438. $new_node->type = 'chado_feature';
  439. $new_node->uid = $user->uid;
  440. $new_node->title = "$feature->name, $feature->uniquename ($feature->cvname) $organism->genus $organism->species";
  441. $new_node->fname = "$feature->name";
  442. $new_node->uniquename = "$feature->uniquename";
  443. $new_node->feature_id = $feature->feature_id;
  444. $new_node->residues = $feature->residues;
  445. $new_node->organism_id = $feature->organism_id;
  446. $new_node->feature_type = $feature->cvname;
  447. $new_node->synonyms = $feature->synonyms;
  448. // validate the node and if okay then submit
  449. node_validate($new_node);
  450. if ($errors = form_get_errors()) {
  451. print "Error encountered validating new node. Cannot sync\n";
  452. foreach ($errors as $key => $msg) {
  453. watchdog('trp-fsync', "%msg", array('%msg' => $msg), 'error');
  454. }
  455. exit;
  456. }
  457. else {
  458. $node = node_submit($new_node);
  459. node_save($node);
  460. }
  461. }
  462. else {
  463. $node = $chado_feature;
  464. }
  465. // set the taxonomy for this node
  466. drupal_set_message(t("%feature_id ($node->nid): setting taxonomy", array('%feature_id' => $feature_id)));
  467. tripal_feature_set_taxonomy($node, $feature_id);
  468. return '';
  469. }