
Publishing: switched to 500-record chunks to limit transaction length.

Lacey Sanderson 6 years ago
parent commit d5dd0c51f7
1 changed file with 109 additions and 88 deletions

+ 109 - 88
tripal_chado/api/tripal_chado.api.inc

@@ -68,6 +68,11 @@ function chado_publish_records($values, $job_id = NULL) {
   $filters = array_key_exists('filters', $values) ? $values['filters'] : array();
   $sync_node = array_key_exists('sync_node', $values) ? $values['sync_node'] : '';
 
+  // We want to break the records to be published into chunks so that no single
+  // transaction runs for too long (a performance concern). The number of records
+  // processed per chunk is set here:
+  $chunk_size = 500;
+
   // @performance remove after development: 0.00059294700622559s
 
   // Load the bundle entity so we can get information about which Chado
@@ -200,108 +205,124 @@ function chado_publish_records($values, $job_id = NULL) {
   $sql = "SELECT count(*) as num_records " . $from . $where;
   $result = chado_query($sql, $args);
   $count = $result->fetchField();
-
-  // calculate the interval for updates
-  $interval = intval($count / 50);
-  if ($interval < 1) {
-    $interval = 1;
-  }
+  print "\nThere are $count records to publish.\n";
 
   // @performance remove after development:0.25212502479553s
-  print 'Count amount to do :' . (microtime(true) - $started_at) . "s.\n";
+  // @performance print 'Count amount to do :' . (microtime(true) - $started_at) . "s.\n";
+
+  print "\nNOTE: publishing records is performed using database transactions. If the job fails\n" .
+          "or is terminated prematurely then the current set of $chunk_size is rolled back with\n" .
+          "no changes to the database. Simply re-run the publishing job to publish any remaining\n".
+          "content after fixing the issue that caused the job to fail.\n\n" .
+          "Also, the following progress only updates every $chunk_size records.\n";
 
   // Perform the query.
-  $sql = $select . $from . $where;
-  $records = chado_query($sql, $args);
-
-  // @performance remove after development:0.43729090690613s
-  print 'Perform Query :' . (microtime(true) - $started_at) . "s.\n";
-
-  // @performance evaluate this transaction. Long running transactions can have serious
-  // performance issues in PostgreSQL. One option is to move the transaction within the
-  // loop so that each one is not very long but then we end up with more overhead creating
-  // transactions. A better albeit more complicated approach might be to break the job into
-  // chunks where each one is a single transaction.
-  $transaction = db_transaction();
-
-  print "\nNOTE: publishing records is performed using a database transaction. \n" .
-      "If the load fails or is terminated prematurely then the entire set of \n" .
-      "is rolled back with no changes to the database\n\n";
-
-  $i = 0;
-  printf("%d of %d records. (%0.2f%%) Memory: %s bytes\r", $i, $count, 0, number_format(memory_get_usage()));
-  try {
-    while($record = $records->fetchObject()) {
-
-      // @performance remove after development
-      print 'Start current entity :' . (microtime(true) - $started_at) . "s.\n";
-
-      // update the job status every interval
-      if ($i % $interval == 0) {
-        $complete = ($i / $count) * 33.33333333;
-        // Currently don't support setting job progress within a transaction.
-        // if ($report_progress) { $job->setProgress(intval($complete * 3)); }
-        printf("%d of %d records. (%0.2f%%) Memory: %s bytes\r", $i, $count, $complete * 3, number_format(memory_get_usage()));
-      }
+  $sql = $select . $from . $where . ' LIMIT ' . $chunk_size;
+  $more_records_to_publish = TRUE;
+  $total_published = 0;
+  while ($more_records_to_publish) {
+
+    // @performance remove after development:0.43729090690613s
+    // @performance limiting this query DRASTICALLY decreases query execution time: 0.26s
+    // @performance print 'Perform Query :' . (microtime(true) - $started_at) . "s.\n\n";
+    $records = chado_query($sql, $args);
+
+    // @performance evaluate this transaction. Long running transactions can have serious
+    // performance issues in PostgreSQL. One option is to move the transaction within the
+    // loop so that each one is not very long but then we end up with more overhead creating
+    // transactions. A better albeit more complicated approach might be to break the job into
+    // chunks where each one is a single transaction.
+    $transaction = db_transaction();
+
+    // Update the job status at the start of each chunk.
+    $complete = ($total_published / $count) * 33.33333333;
+    // Currently don't support setting job progress within a transaction.
+    // if ($report_progress) { $job->setProgress(intval($complete * 3)); }
+    if ($total_published === 0) {
+      printf("%d of %d records. (%0.2f%%) Memory: %s bytes.\r",
+        $total_published, $count, 0, number_format(memory_get_usage()));
+    }
+    else {
+      printf("%d of %d records. (%0.2f%%) Memory: %s bytes; Current run time: %s minutes.\r",
+        $total_published, $count, $complete * 3, number_format(memory_get_usage()), number_format((microtime(true) - $started_at)/60, 2));
+    }
 
-      // First save the tripal_entity record.
-      // @performace This is likely a bottleneck. Too bad we can't create
-      // multiple entities at once... sort of like the copy method.
-      $record_id = $record->record_id;
-      $ec = entity_get_controller('TripalEntity');
-      $entity = $ec->create(array(
-        'bundle' => $bundle_name,
-        'term_id' => $bundle->term_id,
-        // Add in the Chado details for when the hook_entity_create()
-        // is called and our tripal_chado_entity_create() implementation
-        // can deal with it.
-        // @performance maybe there is something we can easily do here?
-        'chado_record' => chado_generate_var($table, array($pkey_field => $record_id)),
-        'chado_record_id' => $record_id,
-        'publish' => TRUE,
-      ));
-      $entity = $entity->save();
-      if (!$entity) {
-        throw new Exception('Could not create entity.');
-      }
+    try {
+      $i = 0;
+      while($record = $records->fetchObject()) {
+
+        // @performance remove after development
+        // print 'Start current entity :' . (microtime(true) - $started_at) . "s.\n";
+
+        // First save the tripal_entity record.
+        // @performance This is likely a bottleneck. Too bad we can't create
+        // multiple entities at once... sort of like the copy method.
+        $record_id = $record->record_id;
+        $ec = entity_get_controller('TripalEntity');
+        $entity = $ec->create(array(
+          'bundle' => $bundle_name,
+          'term_id' => $bundle->term_id,
+          // Add in the Chado details for when the hook_entity_create()
+          // is called and our tripal_chado_entity_create() implementation
+          // can deal with it.
+          // @performance maybe there is something we can easily do here?
+          'chado_record' => chado_generate_var($table, array($pkey_field => $record_id)),
+          'chado_record_id' => $record_id,
+          'publish' => TRUE,
+        ));
+        $entity = $entity->save();
+        if (!$entity) {
+          throw new Exception('Could not create entity.');
+        }
 
-        // @performance remove after development: this takes 0.2-0.3s.
-        //print 'Create entity itself :' . (microtime(true) - $started_at) . "s.\n";
+        // @performance remove after development: this takes 0.2-0.3s.
+        // print 'Create entity itself :' . (microtime(true) - $started_at) . "s.\n";
 
-      // Next save the chado entity record.
-      $entity_record = array(
-        'entity_id' => $entity->id,
-        'record_id' => $record_id,
-      );
+        // Next save the chado entity record.
+        $entity_record = array(
+          'entity_id' => $entity->id,
+          'record_id' => $record_id,
+        );
 
-      // For the Tv2 to Tv3 migration we want to add the nid to the
-      // entity so we can associate the node with the entity.
-      if (property_exists($record, 'nid')) {
-        $entity_record['nid'] = $record->nid;
-      }
-      $result = db_insert($chado_entity_table)
-        ->fields($entity_record)
-        ->execute();
-      if(!$result){
-        throw new Exception('Could not create mapping of entity to Chado record.');
-      }
+        // For the Tv2 to Tv3 migration we want to add the nid to the
+        // entity so we can associate the node with the entity.
+        if (property_exists($record, 'nid')) {
+          $entity_record['nid'] = $record->nid;
+        }
+        $result = db_insert($chado_entity_table)
+          ->fields($entity_record)
+          ->execute();
+        if (!$result) {
+          throw new Exception('Could not create mapping of entity to Chado record.');
+        }
 
-      // @performance remove after development: this takes <0.001s.
-      // print 'Relate back to chado :' . (microtime(true) - $started_at) . "s.\n";
+        // @performance remove after development: this takes <0.001s.
+        // print 'Relate back to chado :' . (microtime(true) - $started_at) . "s.\n";
 
-      $i++;
+        $i++;
+        $total_published++;
+      }
     }
+    catch (Exception $e) {
+      $transaction->rollback();
+      $error = $e->getMessage();
+      tripal_report_error('tripal_chado', TRIPAL_ERROR, "Could not publish record: @error", array('@error' => $error));
+      drupal_set_message('Failed publishing record. See recent logs for more details.', 'error');
+      return FALSE;
+    }
+
+    // If we finish the loop without filling a full chunk of $chunk_size records, then we're done!
+    if ($i < $chunk_size) {
+      $more_records_to_publish = FALSE;
+    }
+
+    // Commit our current chunk.
+    unset($transaction);
   }
-  catch (Exception $e) {
-    $transaction->rollback();
-    $error = $e->getMessage();
-    tripal_report_error('tripal_chado', TRIPAL_ERROR, "Could not publish record: @error", array('@error' => $error));
-    drupal_set_message('Failed publishing record. See recent logs for more details.', 'error');
-    return FALSE;
-  }
+
   drupal_set_message("Succesfully published $i " . $bundle->label . " record(s).");
     // @performance remove after development
-  print 'Complete :' . (microtime(true) - $started_at) . "s.\n";
+  print 'Complete! Runtime: ' . number_format(microtime(true) - $started_at) . " seconds.\n";
   return TRUE;
 }
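
In essence, the commit replaces one long-running transaction with one short transaction per chunk of 500 records, exiting once a chunk comes back short. The sketch below is a minimal summary of that pattern, not the module's code: it assumes the Drupal 7 db_transaction() API used above (the transaction commits when the object is unset) and a hypothetical publish_one_record() helper standing in for the entity save and chado-entity mapping steps.

  $sql = $select . $from . $where . ' LIMIT ' . $chunk_size;
  $total_published = 0;
  do {
    // The WHERE clause excludes records that are already published, so
    // re-running the same LIMITed query returns the next unpublished chunk.
    $records = chado_query($sql, $args);

    // One short-lived transaction per chunk.
    $transaction = db_transaction();
    $i = 0;
    try {
      while ($record = $records->fetchObject()) {
        publish_one_record($record);  // hypothetical stand-in for the entity save + mapping insert
        $i++;
        $total_published++;
      }
    }
    catch (Exception $e) {
      // Only the current chunk is rolled back; earlier chunks stay committed.
      $transaction->rollback();
      return FALSE;
    }
    // Destroying the transaction object commits the chunk.
    unset($transaction);
  } while ($i == $chunk_size);  // a short chunk means nothing is left to publish

Because already-published records are filtered out of the query itself, a failed or interrupted job can simply be re-run and will pick up where the last committed chunk left off.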