Browse Source

Pubmed importer fix for 3/second request limit by NCBI API

dsenalik 5 years ago
parent
commit
07b54c77e1

+ 2 - 1
tripal_chado/api/modules/tripal_chado.pub.api.inc

@@ -548,11 +548,12 @@ function chado_execute_pub_importer($import_id, $publish = TRUE,
         ['%num' => $num_pubs], $message_opts);
 
       $subset_report = tripal_pub_add_publications($pubs, $import->do_contact, $do_update, $job);
+      $countpubs = count($pubs);  // the foreach below reuses $pubs as its loop variable, so save the count first
       foreach ($subset_report as $action => $pubs) {
         $report[$action] = array_merge($report[$action], $pubs);
       }
       $page++;
-    } while (count($pubs) == $num_to_retrieve);
+    } while ($countpubs == $num_to_retrieve);
 
     // Publish as requested by the caller.
     _chado_execute_pub_importer_publish($publish, $job, $message_type, $message_opts);

+ 6 - 4
tripal_chado/includes/loaders/tripal_chado.pub_importer_PMID.inc

@@ -208,12 +208,13 @@ function tripal_pub_PMID_search_init($search_str, $retmax) {
   // do a search for a single result so that we can establish a history, and get
   // the number of records. Once we have the number of records we can retrieve
   // those requested in the range.
-  $query_url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" .
+  $query_url = "https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" .
     "db=Pubmed" .
     "&retmax=$retmax" .
     "&usehistory=y" .
     "&term=" . urlencode($search_str);
 
+  usleep(333334);  // 1/3 of a second delay, NCBI limits requests to 3 / second without API key
   $rfh = fopen($query_url, "r");
   if (!$rfh) {
     drupal_set_message('Could not perform Pubmed query. Cannot connect to Entrez.', 'error');
@@ -292,7 +293,7 @@ function tripal_pub_PMID_fetch($query_key, $web_env, $rettype = 'null',
 
   // repeat the search performed previously (using WebEnv & QueryKey) to retrieve
   // the PMIDs within the range specified.  The PMIDs will be returned as a text list
-  $fetch_url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" .
+  $fetch_url = "https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" .
     "rettype=$rettype" .
     "&retmode=$retmod" .
     "&retstart=$start" .
@@ -313,6 +314,7 @@ function tripal_pub_PMID_fetch($query_key, $web_env, $rettype = 'null',
       $fetch_url .= "&$key=$value";
     }
   }
+  usleep(333334);  // 1/3 of a second delay, NCBI limits requests to 3 / second without API key
   $rfh = fopen($fetch_url, "r");
   if (!$rfh) {
     drupal_set_message('ERROR: Could not perform PubMed query.', 'error');
@@ -338,10 +340,10 @@ function tripal_pub_PMID_fetch($query_key, $web_env, $rettype = 'null',
  * XML should contain only a single publication record.
  *
  * Information about the valid elements in the PubMed XML can be found here:
- * http://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
+ * https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
  *
  * Information about PubMed's citation format can be found here
- * http://www.nlm.nih.gov/bsd/policy/cit_format.html
+ * https://www.nlm.nih.gov/bsd/policy/cit_format.html
  *
  * @param $pub_xml
  *  An XML string describing a single publication