tripal_bulk_loader.loader.inc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. <?php
  2. /**
  3. * Add Loader Job Form
  4. *
  5. * This form is meant to be included on the node page to allow users to submit/re-submit
  6. * loading jobs
  7. */
  8. function tripal_bulk_loader_add_loader_job_form ($form_state, $node) {
  9. $form = array();
  10. $form['nid'] = array(
  11. '#type' => 'hidden',
  12. '#value' => $node->nid,
  13. );
  14. $form['file'] = array(
  15. '#type' => 'hidden',
  16. '#value' => $node->file
  17. );
  18. $form['job_id'] = array(
  19. '#type' => 'hidden',
  20. '#value' => $node->job_id,
  21. );
  22. $form['submit'] = array(
  23. '#type' => 'submit',
  24. '#value' => ($node->job_id) ? 'Re-Submit Job' : 'Submit Job',
  25. );
  26. $form['submit-cancel'] = array(
  27. '#type' => ($node->job_id)? 'submit' : 'hidden',
  28. '#value' => 'Cancel Job',
  29. );
  30. return $form;
  31. }
  32. /**
  33. * Add Loader Job Form (Submit)
  34. */
  35. function tripal_bulk_loader_add_loader_job_form_submit ($form, $form_state) {
  36. global $user;
  37. if (preg_match('/Submit Job/', $form_state['values']['op'])) {
  38. //Submit Tripal Job
  39. $job_args[1] = $form_state['values']['nid'];
  40. if (is_readable($form_state['values']['file'])) {
  41. $fname = basename($form_state['values']['file']);
  42. $job_id = tripal_add_job("Bulk Loading Job: $fname",'tripal_bulk_loader', 'tripal_bulk_loader_load_data', $job_args, $user->uid);
  43. // add job_id to bulk_loader node
  44. $success = db_query("UPDATE {tripal_bulk_loader} SET job_id=%d WHERE nid=%d", $job_id, $form_state['values']['nid']);
  45. } else {
  46. drupal_set_message("Can not open ".$form_state['values']['file'].". Job not scheduled.");
  47. }
  48. } elseif (preg_match('/Re-Submit Job/', $form_state['values']['op'])) {
  49. tripal_jobs_rerun($form_state['values']['job_id']);
  50. } elseif (preg_match('/Cancel Job/', $form_state['values']['op'])) {
  51. tripal_jobs_cancel($form_state['values']['job_id']);
  52. }
  53. }
  54. /**
  55. * Tripal Bulk Loader
  56. *
  57. * This is the function that's run by tripal_launch_jobs to bulk load chado data.
  58. * @param $nid
  59. * The Node ID of the bulk loading job node to be loaded. All other needed data is expected to be
  60. * in the node (ie: template ID and file)
  61. *
  62. * Note: Instead of returning a value this function updates the tripal_bulk_loader.status and
  63. * Enters errors into tripal_bulk_loader_errors if they are encountered.
  64. */
  65. function tripal_bulk_loader_load_data($nid) {
  66. $node = node_load($nid);
  67. print "Template: ".$node->template->name." (".$node->template_id.")\n";
  68. print "File: ".$node->file."\n";
  69. // Prep Work ==================================================================================
  70. // Generate default values array
  71. $default_data = array();
  72. $field2column = array();
  73. $record2priority = array();
  74. foreach ($node->template->template_array as $priority => $record_array) {
  75. if (!is_array($record_array)) { continue; }
  76. foreach ($record_array['fields'] as $field_index => $field_array) {
  77. $default_data[$priority]['table'] = $record_array['table'];
  78. $default_data[$priority]['mode'] = ($record_array['mode']) ? $record_array['mode'] : 'insert';
  79. $default_data[$priority]['record_id'] = $record_array['record_id'];
  80. $record2priority[$record_array['record_id']] = $priority;
  81. $default_data[$priority]['required'][$field_array['field']] = $field_array['required'];
  82. if (isset($field_array['regex'])) {
  83. $default_data[$priority]['regex_transform'][$field_array['field']] = $field_array['regex'];
  84. }
  85. if (preg_match('/table field/', $field_array['type'])) {
  86. $default_data[$priority]['values_array'][$field_array['field']] = '';
  87. $default_data[$priority]['need_further_processing'] = TRUE;
  88. $field2column[$priority][$field_array['field']] = $field_array['spreadsheet column'];
  89. } elseif (preg_match('/constant/', $field_array['type'])) {
  90. $default_data[$priority]['values_array'][$field_array['field']] = $field_array['constant value'];
  91. } elseif (preg_match('/foreign key/', $field_array['type'])) {
  92. $default_data[$priority]['values_array'][$field_array['field']] = array();
  93. $default_data[$priority]['values_array'][$field_array['field']]['foreign record'] = $field_array['foreign key'];
  94. $default_data[$priority]['need_further_processing'] = TRUE;
  95. } else {
  96. print 'WARNING: Unsupported type: '. $field_array['type'] . ' for ' . $table . '.' . $field_array['field']."!\n";
  97. }
  98. } // end of foreach field
  99. } //end of foreach record
  100. //print "\nDefault Values Array: ".print_r($default_data, TRUE)."\n";
  101. //print "\nField to Column Mapping: ".print_r($field2column, TRUE)."\n";
  102. // Parse File adding records as we go ========================================================
  103. $file_handle = fopen($node->file, 'r');
  104. if (preg_match('/(t|true|1)/', $node->file_has_header)) { fgets($file_handle, 4096); }
  105. $num_records = 0;
  106. $num_lines = 0;
  107. $num_errors = 0;
  108. while (!feof($file_handle)) {
  109. $line = array();
  110. $raw_line = fgets($file_handle, 4096);
  111. $raw_line = trim($raw_line);
  112. if (preg_match('/^\s*$/', $raw_line)) { continue; } // skips blank lines
  113. $line = preg_split("/\t/", $raw_line);
  114. $num_lines++;
  115. $data = $default_data;
  116. foreach ($data as $priority => $table_data) {
  117. $table = $table_data['table'];
  118. $values = $table_data['values_array'];
  119. //print 'Table: '.$table."\n";
  120. //print 'Default Values:'.print_r($values,TRUE)."\n";
  121. if ($table_data['need_further_processing']) {
  122. $values = tripal_bulk_loader_add_spreadsheetdata_to_values ($values, $line, $field2column[$priority]);
  123. //print 'W/ Spreadsheet Values:'.print_r($values,TRUE)."\n";
  124. $values = tripal_bulk_loader_add_foreignkey_to_values($values, $data, $record2priority);
  125. //print 'W/ Foreign Key Values:'.print_r($values,TRUE)."\n";
  126. }
  127. $values = tripal_bulk_loader_regex_tranform_values($values, $table_data, $line);
  128. //print 'W/ Regex Values:'.print_r($values,TRUE)."\n";
  129. if (!$values) {
  130. $msg = $table_data['record_id'].' ('.$table_data['mode'].') Aborted due to error in previous record.';
  131. watchdog('T_bulk_loader', $msg, array(), WATCHDOG_WARNING);
  132. print "ERROR: ".$msg."\n";
  133. $data[$priority]['error'] = TRUE;
  134. }
  135. if (preg_match('/optional/', $table_array['mode'])) {
  136. // Check all required fields are set
  137. $table_desc = module_invoke_all('chado_'.$table.'_schema');
  138. $fields = $table_desc['fields'];
  139. foreach($fields as $field => $def){
  140. // a field is considered missing if it cannot be null and there is no default
  141. // value for it or it is of type 'serial'
  142. if($def['not null'] == 1 and !array_key_exists($field,$insert_values) and !isset($def['default']) and strcmp($def['type'],serial)!=0){
  143. $msg = $table_data['record_id'].' ('.$table_data['mode'].') Missing Database Required Value: '.$table.'.'.$field;
  144. watchdog('T_bulk_loader', $msg, array(), WATCHDOG_WARNING);
  145. print "ERROR: ".$msg."\n";
  146. $data[$priority]['error'] = TRUE;
  147. }
  148. }
  149. } //end of if optional record
  150. // Check required fields are present
  151. foreach ($table_data['required'] as $field => $required) {
  152. if ($required) {
  153. if (!isset($values[$field])) {
  154. $msg = $table_data['record_id'].' ('.$table_data['mode'].') Missing Template Required Value: '.$table.'.'.$field;
  155. watchdog('T_bulk_loader', $msg, array(), WATCHDOG_WARNING);
  156. print "ERROR: ".$msg."\n";
  157. $data[$priority]['error'] = TRUE;
  158. }
  159. }
  160. }
  161. if ($data[$priority]['error']) {
  162. // print debugging information
  163. print 'Default Table Data: '.print_r($table_data,TRUE)."\n";
  164. print 'Supplemented Values: '.print_r($values,TRUE)."\n";
  165. // continue to next record
  166. continue;
  167. }
  168. // add new values array into the data array
  169. $data[$priority]['values_array'] = $values;
  170. // check if it is already inserted
  171. if ($table_data['inserted']) {
  172. continue;
  173. }
  174. // first check if it already exists
  175. $exists = tripal_core_chado_select($table, array_keys($values), $values, array('has_record'=>TRUE));
  176. //print "Number of Records:".$exists.".\n";
  177. if ($exists === 1) {
  178. if (!preg_match('/select/',$table_data['mode'])) {
  179. $msg = $table_data['record_id'].' ('.$table_data['mode'].') Record already exists in '.$table.' where values:'.print_r($values,TRUE);
  180. watchdog('T_bulk_loader', $msg, array(), WATCHDOG_WARNING);
  181. print "ERROR: ".$msg."\n";
  182. $data[$priority]['error'] = TRUE;
  183. }
  184. } elseif ($exists > 1) {
  185. $msg = $table_data['record_id'].' ('.$table_data['mode'].') 2+ Records exists in '.$table.' where values:'.print_r($values,TRUE);
  186. watchdog('T_bulk_loader', $msg, array(), WATCHDOG_WARNING);
  187. print "ERROR: ".$msg."\n";
  188. $data[$priority]['error'] = TRUE;
  189. } else {
  190. // if it doesn't exist already then insert it
  191. if (!preg_match('/select/',$table_data['mode'])) {
  192. $success = tripal_core_chado_insert($table, $values);
  193. if (!$success) {
  194. $msg = $table_data['record_id'].' ('.$table_data['mode'].') Unable to insert record into '.$table.' where values:'.print_r($values,TRUE);
  195. watchdog('T_bulk_loader', $msg, array(), WATCHDOG_ERROR);
  196. print "ERROR: ".$msg."\n";
  197. $data[$priority]['error'] = TRUE;
  198. } else {
  199. // if mode=insert_once then ensure we only insert it once
  200. if (preg_match('/insert_once/',$table_data['mode'])) {
  201. $default_data[$priority]['inserted'] = TRUE;
  202. }
  203. }
  204. } else {
  205. $msg = $table_data['record_id'].' ('.$table_data['mode'].') No Matching record in '.$table.' where values:'.print_r($values,TRUE);
  206. watchdog('T_bulk_loader', $msg, array(), WATCHDOG_WARNING);
  207. print "ERROR: ".$msg."\n";
  208. $data[$priority]['error'] = TRUE;
  209. }
  210. }// end of if/not record exists
  211. if ($data[$priority]['error']) {
  212. // print debugging information
  213. print 'Table Data: '.print_r($table_data,TRUE)."\n";
  214. print 'Default Table Data: '.print_r($default_data[$priority],TRUE)."\n";
  215. print 'Supplemented Values: '.print_r($values,TRUE)."\n";
  216. }
  217. } // end of foreach table in default data array
  218. } //end of foreach line of file
  219. }
  220. /**
  221. * This function adds the file data to the values array
  222. *
  223. * @param $values
  224. * The default values array -contains all constants
  225. * @param $line
  226. * An array of values for the current line
  227. * @param $field2column
  228. * An array mapping values fields to line columns
  229. * @return
  230. * Supplemented values array
  231. */
  232. function tripal_bulk_loader_add_spreadsheetdata_to_values ($values, $line, $field2column) {
  233. foreach ($values as $field => $value) {
  234. if (is_array($value)) { continue; }
  235. $column = $field2column[$field] - 1;
  236. if ($column < 0) { continue; }
  237. if (preg_match('/\w+/',$line[$column])) {
  238. $values[$field] = $line[$column];
  239. } else {
  240. unset($values[$field]);
  241. }
  242. }
  243. return $values;
  244. }
  245. /**
  246. * Handles foreign keys in the values array.
  247. *
  248. * Specifically, if the value for a field is an array then it is assumed that the array contains
  249. * the name of the record whose values array should be substituted here. Thus the foreign
  250. * record is looked up and the values array is substituted in.
  251. *
  252. */
  253. function tripal_bulk_loader_add_foreignkey_to_values($values, $data, $record2priority) {
  254. foreach ($values as $field => $value) {
  255. if (is_array($value)) {
  256. $foreign_record = $value['foreign record'];
  257. $foreign_priority = $record2priority[$foreign_record];
  258. $foreign_values = $data[$foreign_priority]['values_array'];
  259. // add to current values array
  260. // only if there was no error found when the foreign record was processed
  261. if (isset($data[$foreign_priority]['error']) and $data[$foreign_priority]['error']) {
  262. return FALSE;
  263. } else {
  264. $values[$field] = $foreign_values;
  265. }
  266. }
  267. }
  268. return $values;
  269. }
  270. /**
  271. * Uses a supplied regex to transform spreadsheet values
  272. *
  273. * @param $values
  274. * The select/insert values array for the given table
  275. * @param $table_data
  276. * The data array for the given table
  277. */
  278. function tripal_bulk_loader_regex_tranform_values ($values, $table_data, $line) {
  279. foreach ($table_data['regex_transform'] as $field => $regex_array) {
  280. if (!is_array($regex_array['replace'])) { continue; }
  281. // Check for <#column:\d+#> notation
  282. // if present replace with that column in the current line
  283. foreach ($regex_array['replace'] as $key => $replace) {
  284. if (preg_match_all('/<#column:(\d+)#>/', $replace, $matches)) {
  285. foreach ($matches[1] as $k => $column_num) {
  286. $replace = preg_replace('/'.$matches[0][$k].'/', $line[$column_num-1], $replace);
  287. }
  288. $regex_array['replace'][$key] = $replace;
  289. }
  290. }
  291. // do the full replacement
  292. $old_value = $values[$field];
  293. $new_value = preg_replace($regex_array['pattern'], $regex_array['replace'], $old_value);
  294. $values[$field] = $new_value;
  295. }
  296. return $values;
  297. }
  298. /**
  299. * Flattens an array up to two levels
  300. * Used for printing of arrays without taking up much space
  301. */
  302. function tripal_bulk_loader_flatten_array ($values) {
  303. $flattened_values = array();
  304. foreach ($values as $k => $v) {
  305. if (is_array($v)) {
  306. $vstr = array();
  307. foreach ($v as $vk => $vv) {
  308. if (strlen($vv) > 20) {
  309. $vstr[] = $vk .'=>'. substr($vv, 0, 20) . '...';
  310. } else {
  311. $vstr[] = $vk .'=>'. $vv;
  312. }
  313. }
  314. $v = '{'. implode(',',$vstr) .'}';
  315. } elseif (strlen($v) > 20) {
  316. $v = substr($v, 0, 20) . '...';
  317. }
  318. $flattened_values[] = $k .'=>'. $v;
  319. }
  320. return implode(', ',$flattened_values);
  321. }