tripal_feature.sync_features.inc 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
  1. <?php
  2. /**
  3. * @file
  4. * @todo Add file header description
  5. */
  6. # This script can be run as a stand-alone script to sync all the features from chado to drupal
  7. // Parameter f specifies the feature_id to sync
  8. // -f 0 will sync all features
  9. $arguments = getopt("f:t:");
  10. if (isset($arguments['f']) and isset($arguments['t']) and $arguments['t'] == 'chado_feature') {
  11. $drupal_base_url = parse_url('http://www.example.com');
  12. $_SERVER['HTTP_HOST'] = $drupal_base_url['host'];
  13. $_SERVER['REQUEST_URI'] = $_SERVER['SCRIPT_NAME'] = $_SERVER['PHP_SELF'];
  14. $_SERVER['REMOTE_ADDR'] = NULL;
  15. $_SERVER['REQUEST_METHOD'] = NULL;
  16. require_once 'includes/bootstrap.inc';
  17. drupal_bootstrap(DRUPAL_BOOTSTRAP_FULL);
  18. $feature_id = $arguments['f'];
  19. if ($feature_id > 0) {
  20. tripal_feature_sync_feature($feature_id);
  21. }
  22. else {
  23. print "syncing all features...\n";
  24. tripal_feature_sync_features();
  25. }
  26. }
  27. /**
  28. *
  29. */
  30. function tripal_feature_sync_form() {
  31. $form['description'] = array(
  32. '#type' => 'item',
  33. '#value' => t("Add feature types, optionally select an organism and ".
  34. "click the 'Sync all Features' button to create Drupal ".
  35. "content for features in chado. Only features of the types listed ".
  36. "below in the Feature Types box will be synced. You may limit the ".
  37. "features to be synced by a specific organism. Depending on the ".
  38. "number of features in the chado database this may take a long ".
  39. "time to complete. "),
  40. );
  41. $form['feature_types'] = array(
  42. '#title' => t('Feature Types'),
  43. '#type' => 'textarea',
  44. '#description' => t("Enter the names of the feature types to sync. Pages for these feature ".
  45. "types will be created automatically for features that exist in the ".
  46. "chado database. The names listed here should be spearated by ".
  47. "spaces or entered separately on new lines. The names must match ".
  48. "exactly (spelling and case) with terms in the sequence ontology"),
  49. '#required' => TRUE,
  50. '#default_value' => variable_get('chado_sync_feature_types', 'gene contig'),
  51. );
  52. // get the list of organisms
  53. $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  54. $orgs = tripal_organism_get_synced();
  55. $organisms[] = '';
  56. foreach ($orgs as $organism) {
  57. $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  58. }
  59. $form['organism_id'] = array(
  60. '#title' => t('Organism'),
  61. '#type' => t('select'),
  62. '#description' => t("Choose the organism for which features types set above will be synced. Only organisms which also have been synced will appear in this list."),
  63. '#options' => $organisms,
  64. );
  65. $form['button'] = array(
  66. '#type' => 'submit',
  67. '#value' => t('Sync all Features'),
  68. '#weight' => 3,
  69. );
  70. return $form;
  71. }
  72. /**
  73. *
  74. */
  75. function tripal_feature_sync_form_validate($form, &$form_state) {
  76. $organism_id = $form_state['values']['organism_id'];
  77. $feature_types = $form_state['values']['feature_types'];
  78. // nothing to do
  79. }
  80. /**
  81. *
  82. */
  83. function tripal_feature_sync_form_submit($form, &$form_state) {
  84. global $user;
  85. $organism_id = $form_state['values']['organism_id'];
  86. $feature_types = $form_state['values']['feature_types'];
  87. $job_args = array(0, $organism_id, $feature_types);
  88. if ($organism_id) {
  89. $organism = tripal_core_chado_select('organism', array('genus', 'species'), array('organism_id' => $organism_id));
  90. $title = "Sync all features for " . $organism[0]->genus . " " . $organism[0]->species;
  91. }
  92. else {
  93. $title = t('Sync all features for all synced organisms');
  94. }
  95. variable_set('chado_sync_feature_types', $feature_types);
  96. tripal_add_job($title, 'tripal_feature', 'tripal_feature_sync_features', $job_args, $user->uid);
  97. }
  98. /**
  99. *
  100. * @param $na
  101. * Tripal expects all jobs to have at least one argument. For this function
  102. * we don't need any, so we have this dummy argument as a filler
  103. * @param $job_id
  104. */
  105. function tripal_feature_set_urls($na = NULL, $job = NULL) {
  106. // begin the transaction
  107. db_query("BEGIN");
  108. print "\nNOTE: Setting of URLs is performed using a database transaction. \n" .
  109. "If the load fails or is terminated prematurely then the entire set of \n" .
  110. "new URLs will be rolled back and no changes will be made\n\n";
  111. // get the number of records we need to set URLs for
  112. $csql = "SELECT count(*) FROM {chado_feature}";
  113. $num_nodes = db_result(db_query($csql));
  114. // calculate the interval at which we will print an update on the screen
  115. $num_set = 0;
  116. $num_per_interval = 100;
  117. // prepate the statements which will quickly add url alias. Because these
  118. // are not Chado tables we must manually prepare them
  119. $psql = "
  120. PREPARE del_url_alias_by_src (text) AS
  121. DELETE FROM {url_alias} WHERE src = \$1
  122. ";
  123. db_query($psql);
  124. $psql = "
  125. PREPARE ins_url_alias_nisrds (text, text) AS
  126. INSERT INTO url_alias (src, dst) VALUES (\$1, \$2)
  127. ";
  128. db_query($psql);
  129. // get the URL alias syntax string
  130. $url_alias = variable_get('chado_feature_url_string', '/feature/[genus]/[species]/[type]/[uniquename]');
  131. if (!$url_alias) {
  132. $url_alias = '/feature/[genus]/[species]/[type]/[uniquename]';
  133. }
  134. $url_alias = preg_replace('/^\//', '', $url_alias); // remove any preceeding forward slash
  135. // get the list of features that have been synced
  136. $sql = "SELECT * FROM {chado_feature}";
  137. $nodes = db_query($sql);
  138. while ($node = db_fetch_object($nodes)) {
  139. // get the URL alias
  140. $src = "node/$node->nid";
  141. $dst = tripal_feature_get_feature_url($node, $url_alias);
  142. if (!$dst) {
  143. db_query('DEALLOCATE "del_url_alias_by_src"');
  144. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  145. db_query("ROLLBACK");
  146. return;
  147. }
  148. // if the src and dst is the same (the URL alias couldn't be set)
  149. // then skip to the next one. There's nothing we can do about this one.
  150. if($src == $dst) {
  151. continue;
  152. }
  153. // remove any previous alias and then add the new one
  154. $success = db_query("EXECUTE del_url_alias_by_src('%s')", $src);
  155. if (!$success) {
  156. db_query('DEALLOCATE "del_url_alias_by_src"');
  157. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  158. db_query("ROLLBACK");
  159. watchdog('trp-seturl', "Failed Removing URL Alias: %src", array('%src' => $src), WATCHDOG_ERROR);
  160. return;
  161. }
  162. $success = db_query("EXECUTE ins_url_alias_nisrds('%s', '%s')", $src, $dst);
  163. if (!$success) {
  164. db_query('DEALLOCATE "del_url_alias_by_src"');
  165. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  166. db_query("ROLLBACK");
  167. watchdog('trp-seturl', "Failed Adding URL Alias: %dst", array('%dst' => $dst), WATCHDOG_ERROR);
  168. return;
  169. }
  170. // update the job status every 1% features
  171. if ($job and $num_set % $num_per_interval == 0) {
  172. $percent = ($num_set / $num_nodes) * 100;
  173. tripal_job_set_progress($job, intval($percent));
  174. $percent = sprintf("%.2f", $percent);
  175. print "Setting URLs (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
  176. }
  177. $num_set++;
  178. }
  179. $percent = ($num_set / $num_nodes) * 100;
  180. tripal_job_set_progress($job, intval($percent));
  181. $percent = sprintf("%.2f", $percent);
  182. print "Setting URLs (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
  183. print "\nDone. Set " . number_format($num_set) . " URLs\n";
  184. // unprepare the statements
  185. db_query('DEALLOCATE "del_url_alias_by_src"');
  186. db_query('DEALLOCATE "ins_url_alias_nisrds"');
  187. db_query("COMMIT");
  188. }
  189. /**
  190. *
  191. * @param $node
  192. * A node object containing at least the feature_id and nid
  193. * @param $url_alias
  194. * Optional. This should be the URL alias syntax string that contains
  195. * placeholders such as [id], [genus], [species], [name], [uniquename],
  196. * and [type]. These placeholders will be substituted for actual values.
  197. * If this parameter is not provided then the value of the
  198. * chado_feature_url_string Drupal variable will be used.
  199. */
  200. function tripal_feature_get_feature_url($node, $url_alias = NULL) {
  201. // get the starting URL alias
  202. if(!$url_alias) {
  203. $url_alias = variable_get('chado_feature_url_string', '/feature/[genus]/[species]/[type]/[uniquename]');
  204. if (!$url_alias) {
  205. $url_alias = '/feature/[genus]/[species]/[type]/[uniquename]';
  206. }
  207. $url_alias = preg_replace('/^\//', '', $url_alias); // remove any preceeding forward slash
  208. }
  209. // get the feature
  210. $values = array('feature_id' => $node->feature_id);
  211. $options = array('statement_name' => 'sel_feature_id');
  212. $feature = tripal_core_chado_select('feature', array('*'), $values, $options);
  213. if (!$feature) {
  214. watchdog('trp-seturl', "Cannot find feature when setting URL alias for feature: %id", array('%id' => $node->feature_id), WATCHDOG_ERROR);
  215. return FALSE;
  216. }
  217. $feature = (object) $feature[0];
  218. // get the organism
  219. $values = array('organism_id' => $feature->organism_id);
  220. $options = array('statement_name' => 'sel_organism_id');
  221. $organism = tripal_core_chado_select('organism', array('*'), $values, $options);
  222. if (!$organism) {
  223. watchdog('trp-seturl', "Cannot find organism when setting URL alias for feature: %id", array('%id' => $node->feature_id), WATCHDOG_ERROR);
  224. return FALSE;
  225. }
  226. $genus = preg_replace('/\s/', '_', strtolower($organism[0]->genus));
  227. $species = preg_replace('/\s/', '_', strtolower($organism[0]->species));
  228. // get the type
  229. $values = array('cvterm_id' => $feature->type_id);
  230. $options = array('statement_name' => 'sel_cvterm_id');
  231. $cvterm = tripal_core_chado_select('cvterm', array('name'), $values, $options);
  232. if (!$cvterm) {
  233. watchdog('trp-seturl', "Cannot find type when setting URL alias for feature: %id", array('%id' => $node->feature_id), WATCHDOG_ERROR);
  234. return FALSE;
  235. }
  236. $type = preg_replace('/\s/', '_', $cvterm[0]->name);
  237. // now substitute in the values
  238. $url_alias = preg_replace('/\[id\]/', $feature->feature_id, $url_alias);
  239. $url_alias = preg_replace('/\[genus\]/', $genus, $url_alias);
  240. $url_alias = preg_replace('/\[species\]/', $species, $url_alias);
  241. $url_alias = preg_replace('/\[type\]/', $type, $url_alias);
  242. $url_alias = preg_replace('/\[name\]/', $feature->name, $url_alias);
  243. $url_alias = preg_replace('/\[uniquename\]/', $feature->uniquename, $url_alias);
  244. // the dst field of the url_alias table is only 128 characters long.
  245. // if this is the case then simply return the node URL, we can't set this one
  246. if (strlen($url_alias) > 128) {
  247. watchdog('trp-seturl', "Cannot set alias longer than 128 characters: %alias.", array('%alias' => $url_alias), WATCHDOG_ERROR);
  248. return "node/" . $node->nid;
  249. }
  250. return $url_alias;
  251. }
  252. /**
  253. *
  254. *
  255. * @ingroup tripal_feature
  256. */
  257. function tripal_feature_sync_features($max_sync = 0, $organism_id = NULL,
  258. $feature_types = NULL, $job_id = NULL) {
  259. $i = 0;
  260. // get the list of available sequence ontology terms for which
  261. // we will build drupal pages from features in chado. If a feature
  262. // is not one of the specified typse we won't build a node for it.
  263. if (!$feature_types) {
  264. $allowed_types = variable_get('chado_sync_feature_types', 'gene contig');
  265. }
  266. else {
  267. $allowed_types = $feature_types;
  268. }
  269. $allowed_types = preg_replace("/[\s\n\r]+/", " ", $allowed_types);
  270. print "Looking for features of type: $allowed_types\n";
  271. $so_terms = split(' ', $allowed_types);
  272. $where_cvt = "";
  273. foreach ($so_terms as $term) {
  274. $where_cvt .= "CVT.name = '$term' OR ";
  275. }
  276. $where_cvt = drupal_substr($where_cvt, 0, drupal_strlen($where_cvt)-3); # strip trailing 'OR'
  277. // get the list of organisms that are synced and only include features from
  278. // those organisms
  279. $orgs = tripal_organism_get_synced();
  280. $where_org = "";
  281. foreach ($orgs as $org) {
  282. if ($organism_id) {
  283. if ($org->organism_id and $org->organism_id == $organism_id) {
  284. $where_org .= "F.organism_id = $org->organism_id OR ";
  285. }
  286. }
  287. else {
  288. if ($org->organism_id) {
  289. $where_org .= "F.organism_id = $org->organism_id OR ";
  290. }
  291. }
  292. }
  293. $where_org = drupal_substr($where_org, 0, drupal_strlen($where_org)-3); # strip trailing 'OR'
  294. // use this SQL statement to get the features that we're going to upload
  295. $sql = "SELECT feature_id ".
  296. "FROM {FEATURE} F ".
  297. " INNER JOIN {Cvterm} CVT ON F.type_id = CVT.cvterm_id ".
  298. " INNER JOIN {CV} on CV.cv_id = CVT.cv_id ".
  299. "WHERE ($where_cvt) AND ($where_org) AND CV.name = 'sequence' ".
  300. "ORDER BY feature_id";
  301. // get the list of features
  302. $results = chado_query($sql);
  303. // load into ids array
  304. $count = 0;
  305. $ids = array();
  306. while ($id = db_fetch_object($results)) {
  307. $ids[$count] = $id->feature_id;
  308. $count++;
  309. }
  310. // make sure our vocabularies are set before proceeding
  311. tripal_feature_set_vocabulary();
  312. // pre-create the SQL statement that will be used to check
  313. // if a feature has already been synced. We skip features
  314. // that have been synced
  315. $sql = "SELECT * FROM {chado_feature} WHERE feature_id = %d";
  316. // Iterate through features that need to be synced
  317. $interval = intval($count * 0.01);
  318. if ($interval < 1) {
  319. $interval = 1;
  320. }
  321. $num_ids = sizeof($ids);
  322. $i = 0;
  323. foreach ($ids as $feature_id) {
  324. // update the job status every 1% features
  325. if ($job_id and $i % $interval == 0) {
  326. tripal_job_set_progress($job_id, intval(($i/$count)*100));
  327. }
  328. // if we have a maximum number to sync then stop when we get there
  329. // if not then just continue on
  330. if ($max_sync and $i == $max_sync) {
  331. return '';
  332. }
  333. if (!db_fetch_object(db_query($sql, $feature_id))) {
  334. # parsing all the features can cause memory overruns
  335. # we are not sure why PHP does not clean up the memory as it goes
  336. # to avoid this problem we will call this script through an
  337. # independent system call
  338. print "$i of $num_ids Syncing feature id: $feature_id\n";
  339. $cmd = "php " . drupal_get_path('module', 'tripal_feature') . "/includes/tripal_feature.sync_features.inc -f $feature_id -t chado_feature";
  340. system($cmd);
  341. }
  342. $i++;
  343. }
  344. return '';
  345. }
  346. /**
  347. *
  348. *
  349. * @ingroup tripal_feature
  350. */
  351. function tripal_feature_sync_feature($feature_id) {
  352. //print "\tSyncing feature $feature_id\n";
  353. global $user;
  354. $create_node = 1; // set to 0 if the node exists and we just sync and not create
  355. // get the accession prefix
  356. $aprefix = variable_get('chado_feature_accession_prefix', 'FID');
  357. // if we don't have a feature_id then return
  358. if (!$feature_id) {
  359. drupal_set_message(t("Please provide a feature_id to sync"));
  360. return '';
  361. }
  362. // get information about this feature
  363. $fsql = "SELECT F.feature_id, F.name, F.uniquename,O.genus, ".
  364. " O.species,CVT.name as cvname,F.residues,F.organism_id ".
  365. "FROM {FEATURE} F ".
  366. " INNER JOIN {Cvterm} CVT ON F.type_id = CVT.cvterm_id ".
  367. " INNER JOIN {Organism} O ON F.organism_id = O.organism_ID ".
  368. "WHERE F.feature_id = %d";
  369. $feature = db_fetch_object(chado_query($fsql, $feature_id));
  370. // get the synonyms for this feature
  371. $synsql = "SELECT S.name ".
  372. "FROM {feature_synonym} FS ".
  373. " INNER JOIN {synonym} S on FS.synonym_id = S.synonym_id ".
  374. "WHERE FS.feature_id = %d";
  375. $synonyms = chado_query($synsql, $feature_id);
  376. // now add these synonyms to the feature object as a single string
  377. $synstring = '';
  378. while ($synonym = db_fetch_object($synonyms)) {
  379. $synstring .= "$synonym->name\n";
  380. }
  381. $feature->synonyms = $synstring;
  382. // check to make sure that we don't have any nodes with this feature name as a title
  383. // but without a corresponding entry in the chado_feature table if so then we want to
  384. // clean up that node. (If a node is found we don't know if it belongs to our feature or
  385. // not since features can have the same name/title.)
  386. $tsql = "SELECT * FROM {node} N ".
  387. "WHERE title = '%s'";
  388. $cnsql = "SELECT * FROM {chado_feature} ".
  389. "WHERE nid = %d";
  390. $nodes = db_query($tsql, $feature->name);
  391. // cycle through all nodes that may have this title
  392. while ($node = db_fetch_object($nodes)) {
  393. $feature_nid = db_fetch_object(db_query($cnsql, $node->nid));
  394. if (!$feature_nid) {
  395. drupal_set_message(t("%feature_id: A node is present but the chado_feature entry is missing... correcting", array('%feature_id' => $feature_id)));
  396. node_delete($node->nid);
  397. }
  398. }
  399. // check if this feature already exists in the chado_feature table.
  400. // if we have a chado feature, we want to check to see if we have a node
  401. $cfsql = "SELECT * FROM {chado_feature} ".
  402. "WHERE feature_id = %d";
  403. // @coder-ignore: don't need to use db_rewrite_sql() since need all nodes regardless of access control
  404. $nsql = "SELECT * FROM {node} N ".
  405. "WHERE nid = %d";
  406. $chado_feature = db_fetch_object(db_query($cfsql, $feature->feature_id));
  407. if ($chado_feature) {
  408. drupal_set_message(t("%feature_id: A chado_feature entry exists", array('%feature_id' => $feature_id)));
  409. $node = db_fetch_object(db_query($nsql, $chado_feature->nid));
  410. if (!$node) {
  411. // if we have a chado_feature but not a node then we have a problem and
  412. // need to cleanup
  413. drupal_set_message(t("%feature_id: The node is missing, but has a chado_feature entry... correcting", array('%feature_id' => $feature_id)));
  414. $df_sql = "DELETE FROM {chado_feature} WHERE feature_id = %d";
  415. db_query($df_sql, $feature_id);
  416. }
  417. else {
  418. drupal_set_message(t("%feature_id: A corresponding node exists", array('%feature_id' => $feature_id)));
  419. $create_node = 0;
  420. }
  421. }
  422. // if we've encountered an error then just return.
  423. if ($error_msg = db_error()) {
  424. //print "$error_msg\n";
  425. return '';
  426. }
  427. // if a drupal node does not exist for this feature then we want to
  428. // create one. Note that the node_save call in this block
  429. // will call the hook_submit function which
  430. if ($create_node) {
  431. // get the organism for this feature
  432. $sql = "SELECT * FROM {organism} WHERE organism_id = %d";
  433. $organism = db_fetch_object(chado_query($sql, $feature->organism_id));
  434. drupal_set_message(t("%feature_id: Creating node $feature->name", array('%feature_id' => $feature_id)));
  435. $new_node = new stdClass();
  436. $new_node->type = 'chado_feature';
  437. $new_node->uid = $user->uid;
  438. $new_node->title = "$feature->name, $feature->uniquename ($feature->cvname) $organism->genus $organism->species";
  439. $new_node->fname = "$feature->name";
  440. $new_node->uniquename = "$feature->uniquename";
  441. $new_node->feature_id = $feature->feature_id;
  442. $new_node->residues = $feature->residues;
  443. $new_node->organism_id = $feature->organism_id;
  444. $new_node->feature_type = $feature->cvname;
  445. $new_node->synonyms = $feature->synonyms;
  446. // validate the node and if okay then submit
  447. node_validate($new_node);
  448. if ($errors = form_get_errors()) {
  449. print "Error encountered validating new node. Cannot sync\n";
  450. foreach ($errors as $key => $msg) {
  451. watchdog('trp-fsync', "%msg", array('%msg' => $msg), 'error');
  452. }
  453. exit;
  454. }
  455. else {
  456. $node = node_submit($new_node);
  457. node_save($node);
  458. }
  459. }
  460. else {
  461. $node = $chado_feature;
  462. }
  463. // set the taxonomy for this node
  464. drupal_set_message(t("%feature_id ($node->nid): setting taxonomy", array('%feature_id' => $feature_id)));
  465. tripal_feature_set_taxonomy($node, $feature_id);
  466. return '';
  467. }