Prechádzať zdrojové kódy

Merge pull request #536 from tripal/530-publish

Optimization of Publishing Entities
Stephen Ficklin 6 rokov pred
rodič
commit
b8d1ed3d0b

+ 21 - 1
tripal/includes/TripalEntity.inc

@@ -15,4 +15,24 @@ class TripalEntity extends Entity {
     return array('path' => 'TripalEntity/' . $this->id);
   }
 
-}
+  /**
+   * Permanently saves the entity.
+   *
+   * @param $cache
+   *   This array is used to store objects you want to cache for performance reasons,
+   *   as well as cache-related options. The following are supported:
+   *   - boolean $clear_cached_fields
+   *       Clearing cached fields is NECESSARY. If you choose to set this to FALSE then YOU
+   *       must clear the cache yourself using cache_clear_all('field:TripalEntity:[entity_id]', 'cache_field', TRUE).
+   *       The only known reason to set this to FALSE is to clear the cache in bulk for performance reasons.
+   *   - TripalBundle $bundle
+   *       The bundle for the current entity.
+   *   - TripalTerm $term
+   *       The term for the current entity.
+   * @see entity_save()
+   */
+  public function save($cache = array()) {
+    return entity_get_controller($this->entityType)->save($this, $cache);
+  }
+
+}

+ 93 - 40
tripal/includes/TripalEntityController.inc

@@ -45,7 +45,12 @@ class TripalEntityController extends EntityAPIController {
     $modules = module_implements('entity_create');
     foreach ($modules as $module) {
       $function = $module . '_entity_create';
-      $function($entity, $values['type']);
+      if (isset($values['bundle_object'])) {
+        $function($entity, $values['type'], $values['bundle_object']);
+      }
+      else {
+        $function($entity, $values['type']);
+      }
     }
     return $entity;
 
@@ -102,20 +107,30 @@ class TripalEntityController extends EntityAPIController {
    *   The entity whose title should be changed.
    * @param $title
    *   The title to use. It can contain tokens the correspond to field values.
-   *   Token should be be compatible with those returned by 
+   *   Tokens should be compatible with those returned by
    *   tripal_get_entity_tokens().
+   * @param $cache
+   *   This array is used to store objects you want to cache for performance reasons,
+   *   as well as cache-related options. The following are supported:
+   *   - TripalBundle $bundle
+   *       The bundle for the current entity.
    */
-  public function setTitle($entity, $title = NULL) {
-    
-    $bundle = tripal_load_bundle_entity(array('name' => $entity->bundle));
-    
+  public function setTitle($entity, $title = NULL, $cache = array()) {
+
+    if (isset($cache['bundle'])) {
+      $bundle = $cache['bundle'];
+    }
+    else {
+      $bundle = tripal_load_bundle_entity(array('name' => $entity->bundle));
+    }
+
     // If no title was supplied then we should try to generate one using the
     // default format set by admins.
-    if (!$title) {     
+    if (!$title) {
       $title = tripal_get_title_format($bundle);
     }
     $title = tripal_replace_entity_tokens($title, $entity, $bundle);
-    
+
     if ($title) {
       db_update('tripal_entity')
         ->fields(array(
@@ -128,25 +143,37 @@ class TripalEntityController extends EntityAPIController {
 
   /**
    * Sets the URL alias for an entity.
-   * 
+   *
    * @param $entity
    *   The entity whose URL alias should be changed.
    * @param $alias
    *   The alias to use. It can contain tokens the correspond to field values.
-   *   Token should be be compatible with those returned by 
+   *   Tokens should be compatible with those returned by
    *   tripal_get_entity_tokens().
+   * @param $cache
+   *   This array is used to store objects you want to cache for performance reasons,
+   *   as well as cache-related options. The following are supported:
+   *   - TripalBundle $bundle
+   *       The bundle for the current entity.
+   *   - TripalTerm $term
+   *       The term for the current entity.
    */
-  public function setAlias($entity, $alias = NULL) {
+  public function setAlias($entity, $alias = NULL, $cache = array()) {
     $source_url = "bio_data/$entity->id";
 
     // If no alias was supplied then we should try to generate one using the
     // default format set by admins.
     if (!$alias) {
 
-      // Load the TripalBundle entity for this TripalEntity.
+      // Load the TripalBundle entity for this TripalEntity (if it's not cached).
       // First get the format for the url alias based on the bundle of the entity.
       // Then replace all the tokens with values from the entity fields.
-      $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+      if (isset($cache['bundle'])) {
+        $bundle_entity = $cache['bundle'];
+      }
+      else {
+        $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+      }
       $alias = tripal_get_bundle_variable('url_format', $bundle_entity->id);
       $alias = tripal_replace_entity_tokens($alias, $entity, $bundle_entity);
     }
@@ -155,20 +182,25 @@ class TripalEntityController extends EntityAPIController {
     // the term name and entity id.
     if (!$alias) {
 
-      // Load the term for this TripalEntity. Set a default based on the term 
-      // name and entity id. Then replace all the tokens with values from 
+      // Load the term for this TripalEntity. Set a default based on the term
+      // name and entity id. Then replace all the tokens with values from
       // the entity fields.
-      $term = entity_load('TripalTerm', array('id' => $entity->term_id));
+      $term = (isset($cache['term'])) ? $cache['term'] : entity_load('TripalTerm', array('id' => $entity->term_id));
       $term = reset($term);
       $alias = str_replace(' ', '', $term->name) . '/[TripalEntity__entity_id]';
       $alias = tripal_replace_entity_tokens($alias, $entity, $bundle_entity);
     }
-    
-    // Check if the passed alias has tokens. Load the TripalBundle entity for 
-    // this TripalEntity. Then replace all the tokens with values from the 
+
+    // Check if the passed alias has tokens. Load the TripalBundle entity for
+    // this TripalEntity. Then replace all the tokens with values from the
     // entity fields.
     if($alias && (preg_match_all("/\[[^\]]*\]/", $alias, $bundle_tokens))) {
-      $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+      if (isset($cache['bundle'])) {
+        $bundle_entity = $cache['bundle'];
+      }
+      else {
+        $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+      }
       $alias = tripal_replace_entity_tokens($alias, $entity, $bundle_entity);
     }
 
@@ -193,11 +225,13 @@ class TripalEntityController extends EntityAPIController {
         // First delete any previous alias' for this entity.
         // Then save the new one.
 
-        // TODO: publishing an entity can be very slow if there are lots of
+        // @performance: Look into this further.
+        // @spficklin publishing an entity can be very slow if there are lots of
         // entries in the url_alias table, due to this type of
-        // SQL statement that gets called somewhere by Drupal:
+        // SQL statement that gets called in drupal_path_alias_whitelist_rebuild():
         // SELECT DISTINCT SUBSTRING_INDEX(source, '/', 1) AS path FROM url_alias.
         // Perhaps we should write our own SQL to avoid this issue.
+        // @lacey: drupal_path_alias_whitelist_rebuild() isn't getting called for me during publish.
         $values =  array(
           'source' => $source_url,
           'alias' => $alias,
@@ -243,11 +277,16 @@ class TripalEntityController extends EntityAPIController {
           drupal_write_record('url_alias', $values);
         }
       }
-      // If there is only one alias matching then it might just be that we 
+      // If there is only one alias matching then it might just be that we
       // already assigned this alias to this entity in a previous save.
       elseif ($num_aliases == 1) {
 
-        $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+        if (isset($cache['bundle'])) {
+          $bundle_entity = $cache['bundle'];
+        }
+        else {
+          $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+        }
 
         // Check to see if the single alias is for the same entity and if not
         // warn the admin that the alias is already used (ie: not unique?)
@@ -275,7 +314,12 @@ class TripalEntityController extends EntityAPIController {
       // If there are more then one alias' matching what we generated then there's
       // a real problem and we need to warn the administrator.
       else {
-        $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+        if (isset($cache['bundle'])) {
+          $bundle_entity = $cache['bundle'];
+        }
+        else {
+          $bundle_entity = tripal_load_bundle_entity(array('name' => $entity->bundle));
+        }
 
         $aliases = db_query('SELECT source FROM {url_alias} WHERE alias=:alias',
           array(':alias' => $alias))->fetchAll();
@@ -308,19 +352,26 @@ class TripalEntityController extends EntityAPIController {
    *
    * @param $entity
    *   A TripalEntity object to save.
+   * @param $cache
+   *   This array is used to store objects you want to cache for performance reasons,
+   *   as well as cache-related options. The following are supported:
+   *   - boolean $clear_cached_fields
+   *       Clearing cached fields is NECESSARY. If you choose to set this to FALSE then YOU
+   *       must clear the cache yourself using cache_clear_all('field:TripalEntity:[entity_id]', 'cache_field', TRUE).
+   *       The only known reason to set this to FALSE is to clear the cache in bulk for performance reasons.
+   *   - TripalBundle $bundle
+   *       The bundle for the current entity.
+   *   - TripalTerm $term
+   *       The term for the current entity.
    *
    * @return
    *   The saved entity object with updated properties.
    */
-  public function save($entity, DatabaseTransaction $transaction = NULL) {
+  public function save($entity, $cache = array()) {
     global $user;
     $pkeys = array();
 
-    // Get the author information.
-    $author = $user;
-    if (property_exists($entity, 'uid')) {
-      $author = user_load($entity->uid);
-    }
+    if (!isset($cache['clear_cached_fields'])) $cache['clear_cached_fields'] = TRUE;
 
     $changed_date = time();
     $create_date = $changed_date;
@@ -338,12 +389,12 @@ class TripalEntityController extends EntityAPIController {
       }
     }
 
-    $transaction = isset($transaction) ? $transaction : db_transaction();
+    $transaction = db_transaction();
     try {
       // If our entity has no id, then we need to give it a
       // time of creation.
       if (empty($entity->id)) {
-        $entity->created = time();
+        $entity->created = $created_date;
         $invocation = 'entity_insert';
       }
       else {
@@ -363,7 +414,7 @@ class TripalEntityController extends EntityAPIController {
         'type'      => $entity->type,
         'bundle'    => $entity->bundle,
         'title'     => $entity->title,
-        'uid'       => $author->uid,
+        'uid'       => $entity->uid,
         'created'   => $create_date,
         'changed'   => $changed_date,
         'status'    => $status,
@@ -394,10 +445,10 @@ class TripalEntityController extends EntityAPIController {
       }
 
       // Set the title for this entity.
-      $this->setTitle($entity);
+      $this->setTitle($entity, NULL, $cache);
 
       // Set the path/url alias for this entity.
-      $this->setAlias($entity);
+      $this->setAlias($entity, NULL, $cache);
 
       // Invoke either hook_entity_update() or hook_entity_insert().
       module_invoke_all('entity_postsave', $entity, $entity->type);
@@ -405,8 +456,12 @@ class TripalEntityController extends EntityAPIController {
 
       // Clear any cache entries for this entity so it can be reloaded using
       // the values that were just saved.
-      $cid = 'field:TripalEntity:' . $entity->id;
-      cache_clear_all($cid, 'cache_field', TRUE);
+      // Also, we don't need to clear cached fields when publishing because we
+      // didn't attach any (see above).
+      if ($cache['clear_cached_fields'] AND ($invocation != 'entity_publish')) {
+        $cid = 'field:TripalEntity:' . $entity->id;
+        cache_clear_all($cid, 'cache_field', TRUE);
+      }
 
       return $entity;
     }
@@ -416,8 +471,6 @@ class TripalEntityController extends EntityAPIController {
       drupal_set_message("Could not save the entity: " . $e->getMessage(), "error");
       return FALSE;
     }
-
-
   }
 
   /**

+ 116 - 72
tripal_chado/api/tripal_chado.api.inc

@@ -4,7 +4,7 @@
  * @file
  *
  * This file contains miscellaneous API functions specific to working with
- * records in Chado that do not have a home in any other sub category of 
+ * records in Chado that do not have a home in any other sub category of
  * API functions.
  */
 
@@ -12,9 +12,9 @@
  * @defgroup tripal_chado_api Chado
  *
  * @ingroup tripal_api
- * The Tripal Chado API is a set of functions for interacting with data 
+ * The Tripal Chado API is a set of functions for interacting with data
  * inside of a Chado relational database. Entities (or pages) in Drupal
- * that are provided by Tripal can supply data from any supported database 
+ * that are provided by Tripal can supply data from any supported database
  * back-end, and Chado is the default. This API contains a variety of sub
  * categories (or groups) where functions are organized.  Any extension module
  * that desires to work with data in Chado will find these functions useful.
@@ -39,6 +39,9 @@
  */
 function chado_publish_records($values, $job_id = NULL) {
 
+  // Used for adding runtime to the progress report.
+  $started_at = microtime(true);
+
   // We want the job object in order to report progress.
   if (is_object($job_id)) {
     $job = $job_id;
@@ -52,6 +55,9 @@ function chado_publish_records($values, $job_id = NULL) {
     $report_progress = TRUE;
   }
 
+  // Start an array for caching objects to save performance.
+  $cache = array();
+
   // Make sure we have the required options: bundle_name.
   if (!array_key_exists('bundle_name', $values) or !$values['bundle_name']) {
     tripal_report_error('tripal_chado', TRIPAL_ERROR,
@@ -65,9 +71,15 @@ function chado_publish_records($values, $job_id = NULL) {
   $filters = array_key_exists('filters', $values) ? $values['filters'] : array();
   $sync_node = array_key_exists('sync_node', $values) ? $values['sync_node'] : '';
 
+  // We want to break the number of records to publish into chunks in order to ensure
+  // transactions do not run for too long (performance issue). The number of records
+  // to be processed per chunk is set here:
+  $chunk_size = 500;
+
   // Load the bundle entity so we can get information about which Chado
   // table/field this entity belongs to.
   $bundle = tripal_load_bundle_entity(array('name' => $bundle_name));
+  $cache['bundle'] = $bundle;
   if (!$bundle) {
     tripal_report_error('tripal_chado', TRIPAL_ERROR,
         "Unknown bundle. Could not publish record: @error",
@@ -76,7 +88,6 @@ function chado_publish_records($values, $job_id = NULL) {
   }
   $chado_entity_table = chado_get_bundle_entity_table($bundle);
 
-
   // Get the mapping of the bio data type to the Chado table.
   $chado_bundle = db_select('chado_bundle', 'cb')
     ->fields('cb')
@@ -89,6 +100,10 @@ function chado_publish_records($values, $job_id = NULL) {
     return FALSE;
   }
 
+  // Load the term for use in setting the alias for each entity created.
+  $term = entity_load('TripalTerm', array('id' => $entity->term_id));
+  $cache['term'] = $term;
+
   $table = $chado_bundle->data_table;
   $type_column = $chado_bundle->type_column;
   $type_linker_table = $chado_bundle->type_linker_table;
@@ -181,86 +196,115 @@ function chado_publish_records($values, $job_id = NULL) {
       }
     }
   }
+
   // First get the count
+  // @performance optimize, estimate or remove this. It's only used for reporting progress on the command-line.
   $sql = "SELECT count(*) as num_records " . $from . $where;
   $result = chado_query($sql, $args);
   $count = $result->fetchField();
+  print "\nThere are $count records to publish.\n";
 
-  // calculate the interval for updates
-  $interval = intval($count / 50);
-  if ($interval < 1) {
-    $interval = 1;
-  }
+  print "\nNOTE: publishing records is performed using database transactions. If the job fails\n" .
+          "or is terminated prematurely then the current set of $chunk_size is rolled back with\n" .
+          "no changes to the database. Simply re-run the publishing job to publish any remaining\n".
+          "content after fixing the issue that caused the job to fail.\n\n" .
+          "Also, the following progress only updates every $chunk_size records.\n";
 
   // Perform the query.
-  $sql = $select . $from . $where;
-  $records = chado_query($sql, $args);
-  $transaction = db_transaction();
-
-  print "\nNOTE: publishing records is performed using a database transaction. \n" .
-      "If the load fails or is terminated prematurely then the entire set of \n" .
-      "is rolled back with no changes to the database\n\n";
-
-  $i = 0;
-  printf("%d of %d records. (%0.2f%%) Memory: %s bytes\r", $i, $count, 0, number_format(memory_get_usage()));
-  try {
-    while($record = $records->fetchObject()) {
-
-      // update the job status every interval
-      if ($i % $interval == 0) {
-        $complete = ($i / $count) * 33.33333333;
-        // Currently don't support setting job progress within a transaction.
-        // if ($report_progress) { $job->setProgress(intval($complete * 3)); }
-        printf("%d of %d records. (%0.2f%%) Memory: %s bytes\r", $i, $count, $complete * 3, number_format(memory_get_usage()));
-      }
+  $sql = $select . $from . $where . ' LIMIT '.$chunk_size;
+  $more_records_to_publish = TRUE;
+  $total_published = 0;
+  while ($more_records_to_publish) {
+
+    $records = chado_query($sql, $args);
+
+    // Update the job status every chunk start.
+    // Because this is outside of the transaction, we can update the admin through the jobs UI.
+    $complete = ($total_published / $count) * 33.33333333;
+    if ($report_progress) { $job->setProgress(intval($complete * 3)); }
+    if ($total_published === 0) {
+      printf("%d of %d records. (%0.2f%%) Memory: %s bytes.\r",
+        $i, $count, 0, number_format(memory_get_usage()), 0);
+    }
+    else {
+      printf("%d of %d records. (%0.2f%%) Memory: %s bytes; Current run time: %s minutes.\r",
+        $total_published, $count, $complete * 3, number_format(memory_get_usage()), number_format((microtime(true) - $started_at)/60, 2));
+    }
 
-      // First save the tripal_entity record.
-      $record_id = $record->record_id;
-      $ec = entity_get_controller('TripalEntity');
-      $entity = $ec->create(array(
-        'bundle' => $bundle_name,
-        'term_id' => $bundle->term_id,
-        // Add in the Chado details for when the hook_entity_create()
-        // is called and our tripal_chado_entity_create() implementation
-        // can deal with it.
-        'chado_record' => chado_generate_var($table, array($pkey_field => $record_id)),
-        'chado_record_id' => $record_id,
-        'publish' => TRUE,
-      ));
-      $entity = $entity->save();
-      if (!$entity) {
-        throw new Exception('Could not create entity.');
-      }
+    // There is no need to cache transactions since Drupal handles nested transactions
+    // "by performing no transactional operations (as far as the database sees) within
+    // the inner nesting layers". Effectively, Drupal ensures nested transactions work the
+    // same as passing a transaction through to the deepest level and not starting a new
+    // transaction if we are already in one.
+    $transaction = db_transaction();
+
+    try {
+      $i = 0;
+      while($record = $records->fetchObject()) {
+
+        // First save the tripal_entity record.
+        // @performance This is likely a bottleneck. Too bad we can't create
+        // multiple entities at once... sort of like the copy method.
+        $record_id = $record->record_id;
+        $ec = entity_get_controller('TripalEntity');
+
+        $entity = $ec->create(array(
+          'bundle' => $bundle_name,
+          'term_id' => $bundle->term_id,
+          // Add in the Chado details for when the hook_entity_create()
+          // is called and our tripal_chado_entity_create() implementation
+          // can deal with it.
+          'chado_record' => chado_generate_var($table, array($pkey_field => $record_id), array('include_fk' => 0)),
+          'chado_record_id' => $record_id,
+          'publish' => TRUE,
+          'bundle_object' => $bundle,
+        ));
+
+        $entity = $entity->save($cache);
+        if (!$entity) {
+          throw new Exception('Could not create entity.');
+        }
 
-      // Next save the chado entity record.
-      $entity_record = array(
-        'entity_id' => $entity->id,
-        'record_id' => $record_id,
-      );
+        // Next save the chado entity record.
+        $entity_record = array(
+          'entity_id' => $entity->id,
+          'record_id' => $record_id,
+        );
 
-      // For the Tv2 to Tv3 migration we want to add the nid to the
-      // entity so we can associate the node with the entity.
-      if (property_exists($record, 'nid')) {
-        $entity_record['nid'] = $record->nid;
-      }
-      $result = db_insert($chado_entity_table)
-        ->fields($entity_record)
-        ->execute();
-      if(!$result){
-        throw new Exception('Could not create mapping of entity to Chado record.');
+        // For the Tv2 to Tv3 migration we want to add the nid to the
+        // entity so we can associate the node with the entity.
+        if (property_exists($record, 'nid')) {
+          $entity_record['nid'] = $record->nid;
+        }
+        $result = db_insert($chado_entity_table)
+          ->fields($entity_record)
+          ->execute();
+        if(!$result){
+          throw new Exception('Could not create mapping of entity to Chado record.');
+        }
+
+        $i++;
+        $total_published++;
       }
+    }
+    catch (Exception $e) {
+      $transaction->rollback();
+      $error = $e->getMessage();
+      tripal_report_error('tripal_chado', TRIPAL_ERROR, "Could not publish record: @error", array('@error' => $error));
+      drupal_set_message('Failed publishing record. See recent logs for more details.', 'error');
+      return FALSE;
+    }
 
-      $i++;
+    // If this chunk yielded fewer than $chunk_size records, then we're done!
+    if ($i < $chunk_size) {
+      $more_records_to_publish = FALSE;
     }
+
+    // Commit our current chunk.
+    unset($transaction);
   }
-  catch (Exception $e) {
-    $transaction->rollback();
-    $error = $e->getMessage();
-    tripal_report_error('tripal_chado', TRIPAL_ERROR, "Could not publish record: @error", array('@error' => $error));
-    drupal_set_message('Failed publishing record. See recent logs for more details.', 'error');
-    return FALSE;
-  }
-  drupal_set_message("Succesfully published $i " . $bundle->label . " record(s).");
+
+  drupal_set_message("Succesfully published $total_published " . $bundle->label . " record(s).");
   return TRUE;
 }
 
@@ -271,7 +315,7 @@ function chado_publish_records($values, $job_id = NULL) {
  *    The name of a base table in Chado.
  * @return
  *    An array of tokens where the key is the machine_name of the token.
- * 
+ *
  * @ingroup tripal_chado_api
  */
 function chado_get_tokens($base_table) {
@@ -329,7 +373,7 @@ function chado_get_tokens($base_table) {
  *
  * @return
  *   The string will all tokens replaced with values.
- * 
+ *
  *  @ingroup tripal_chado_api
  */
 function chado_replace_tokens($string, $record) {

+ 11 - 2
tripal_chado/includes/tripal_chado.entity.inc

@@ -7,8 +7,15 @@
  * This hook is called when brand new entities are created, but
  * they are not loaded so the hook_entity_load() is not yet called. We
  * can use this hook to add properties to the entity before saving.
+ *
+ * @param $entity
+ *   The entity being created.
+ * @param $type
+ *   The type of entity being created.
+ * @param $bundle (OPTIONAL)
+ *   The bundle object for the current entity.
  */
-function tripal_chado_entity_create(&$entity, $type) {
+function tripal_chado_entity_create(&$entity, $type, $bundle = NULL) {
   if ($type == 'TripalEntity') {
 
     // Set some defaults on vars needed by this module.
@@ -18,7 +25,9 @@ function tripal_chado_entity_create(&$entity, $type) {
       $entity->chado_linker = NULL;
 
       // Add in the Chado table information for this entity type.
-      $bundle = tripal_load_bundle_entity(array('name' => $entity->bundle));
+      if (!$bundle) {
+        $bundle = tripal_load_bundle_entity(array('name' => $entity->bundle));
+      }
       if ($bundle->data_table) {
         $entity->chado_table = $bundle->data_table;
         $entity->chado_column = $bundle->type_column;