$total_records, 'search_str' => $search_str, 'pubs' => [], ]; } // now get the list of PMIDs from the initialized search $pmids_txt = tripal_pub_PMID_fetch($query_key, $web_env, 'uilist', 'text', $start, $num_to_retrieve); // iterate through each PMID and get the publication record. This requires a new search and new fetch $pmids = explode("\n", trim($pmids_txt)); $pubs = []; foreach ($pmids as $pmid) { // now retrieve the individual record $pub_xml = tripal_pub_PMID_fetch($query_key, $web_env, 'null', 'xml', 0, 1, ['id' => $pmid]); $pub = tripal_pub_PMID_parse_pubxml($pub_xml); $pubs[] = $pub; } return [ 'total_records' => $total_records, 'search_str' => $search_str, 'pubs' => $pubs, ]; } /** * Initailizes a PubMed Search using a given search string * * @param $search_str * The PubMed Search string * @param $retmax * The maximum number of records to return * * @return * An array containing the Count, WebEnv and QueryKey as return * by PubMed's esearch utility * * @ingroup tripal_pub */ function tripal_pub_PMID_search_init($search_str, $retmax) { // do a search for a single result so that we can establish a history, and get // the number of records. Once we have the number of records we can retrieve // those requested in the range. $query_url = "https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" . "db=Pubmed" . "&retmax=$retmax" . "&usehistory=y" . "&term=" . urlencode($search_str); $api_key = variable_get('tripal_pub_importer_ncbi_api_key', NULL); $sleep_time = 333334; if (!empty($api_key)) { $query_url .= "&api_key=" . $api_key; $sleep_time = 100000; } usleep($sleep_time); // 1/3 of a second delay, NCBI limits requests to 3 / second without API key $rfh = fopen($query_url, "r"); if (!$rfh) { drupal_set_message('Could not perform Pubmed query. Cannot connect to Entrez.', 'error'); tripal_report_error('tripal_pubmed', TRIPAL_ERROR, "Could not perform Pubmed query. Cannot connect to Entrez.", []); return 0; } // retrieve the XML results $query_xml = ''; while (!feof($rfh)) { $query_xml .= fread($rfh, 255); } fclose($rfh); $xml = new XMLReader(); $xml->xml($query_xml); // iterate though the child nodes of the tag and get the count, history and query_id $result = []; while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'WebEnv') { // we've read as much as we need. If we go too much further our counts // will get messed up by other 'Count' elements. so we're done. break; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'Count': $xml->read(); $result['Count'] = $xml->value; break; case 'WebEnv': $xml->read(); $result['WebEnv'] = $xml->value; break; case 'QueryKey': $xml->read(); $result['QueryKey'] = $xml->value; break; } } } return $result; } /** * Retrieves from PubMed a set of publications from the * previously initiated query. * * @param $query_key * The esearch QueryKey * @param $web_env * The esearch WebEnv * @param $rettype * The efetch return type * @param $retmod * The efetch return mode * @param $start * The start of the range to retrieve * @param $limit * The number of publications to retrieve * @param $args * Any additional arguments to add the efetch query URL * * @return * An array containing the total_records in the dataaset, the search string * and an array of the publications that were retreived. * * @ingroup tripal_pub */ function tripal_pub_PMID_fetch($query_key, $web_env, $rettype = 'null', $retmod = 'null', $start = 0, $limit = 10, $args = []) { // repeat the search performed previously (using WebEnv & QueryKey) to retrieve // the PMID's within the range specied. The PMIDs will be returned as a text list $fetch_url = "https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" . "rettype=$rettype" . "&retmode=$retmod" . "&retstart=$start" . "&retmax=$limit" . "&db=Pubmed" . "&query_key=$query_key" . "&WebEnv=$web_env"; $api_key = variable_get('tripal_pub_importer_ncbi_api_key', NULL); $sleep_time = 333334; if (!empty($api_key)) { $fetch_url .= "&api_key=" . $api_key; $sleep_time = 100000; } foreach ($args as $key => $value) { if (is_array($value)) { $fetch_url .= "&$key="; foreach ($value as $item) { $fetch_url .= "$item,"; } $fetch_url = substr($fetch_url, 0, -1); // remove trailing comma } else { $fetch_url .= "&$key=$value"; } } usleep($sleep_time); // 1/3 of a second delay, NCBI limits requests to 3 / second without API key $rfh = fopen($fetch_url, "r"); if (!$rfh) { drupal_set_message('ERROR: Could not perform PubMed query.', 'error'); tripal_report_error('tripal_pubmed', TRIPAL_ERROR, "Could not perform PubMed query: %fetch_url.", ['%fetch_url' => $fetch_url]); return ''; } $results = ''; if ($rfh) { while (!feof($rfh)) { $results .= fread($rfh, 255); } fclose($rfh); } return $results; } /** * This function parses the XML containing details of a publication and * converts it into an associative array of where keys are Tripal Pub * ontology terms and the values are extracted from the XML. The * XML should contain only a single publication record. * * Information about the valid elements in the PubMed XML can be found here: * https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html * * Information about PubMed's citation format can be found here * https://www.nlm.nih.gov/bsd/policy/cit_format.html * * @param $pub_xml * An XML string describing a single publication * * @return * An array describing the publication * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_pubxml($pub_xml) { $pub = []; if (!$pub_xml) { return $pub; } // read the XML and iterate through it. $xml = new XMLReader(); $xml->xml(trim($pub_xml)); while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'ERROR': $xml->read(); // get the value for this element tripal_report_error('tripal_pubmed', TRIPAL_ERROR, "Error: %err", ['%err' => $xml->value]); break; case 'PMID': // thre are multiple places where a PMID is present in the XML and // since this code does not descend into every branch of the XML tree // we will encounter many of them here. Therefore, we only want the // PMID that we first encounter. If we already have the PMID we will // just skip it. Examples of other PMIDs are in the articles that // cite this one. $xml->read(); // get the value for this element if (!array_key_exists('Publication Dbxref', $pub)) { $pub['Publication Dbxref'] = 'PMID:' . $xml->value; } break; case 'Article': $pub_model = $xml->getAttribute('PubModel'); $pub['Publication Model'] = $pub_model; tripal_pub_PMID_parse_article($xml, $pub); break; case 'MedlineJournalInfo': tripal_pub_PMID_parse_medline_journal_info($xml, $pub); break; case 'ChemicalList': // TODO: handle this break; case 'SupplMeshList': // TODO: meant for protocol list break; case 'CitationSubset': // TODO: not sure this is needed. break; case 'CommentsCorrections': // TODO: handle this break; case 'GeneSymbolList': // TODO: handle this break; case 'MeshHeadingList': // TODO: Medical subject headings break; case 'NumberOfReferences': // TODO: not sure we should keep this as it changes frequently. break; case 'PersonalNameSubjectList': // TODO: for works about an individual or with biographical note/obituary. break; case 'OtherID': // TODO: ID's from another NLM partner. break; case 'OtherAbstract': // TODO: when the journal does not contain an abstract for the publication. break; case 'KeywordList': // TODO: handle this break; case 'InvestigatorList': // TODO: personal names of individuals who are not authors (can be used with collection) break; case 'GeneralNote': // TODO: handle this break; case 'DeleteCitation': // TODO: need to know how to handle this break; default: break; } } } $pub['Citation'] = chado_pub_create_citation($pub); $pub['raw'] = $pub_xml; return $pub; } /** * Parses the section from the XML returned from PubMed that contains * information about the Journal * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_medline_journal_info($xml, &$pub) { while ($xml->read()) { // get this element name $element = $xml->name; // if we're at the element then we're done with the article... if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'MedlineJournalInfo') { return; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'Country': // the place of publication of the journal $xml->read(); $pub['Journal Country'] = $xml->value; break; case 'MedlineTA': // TODO: not sure how this is different from ISOAbbreviation break; case 'NlmUniqueID': // TODO: the journal's unique ID in medline break; case 'ISSNLinking': // TODO: not sure how this is different from ISSN break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information about an article. * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_article($xml, &$pub) { while ($xml->read()) { // get this element name $element = $xml->name; // if we're at the element then we're done with the article... if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'Article') { return; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'Journal': tripal_pub_PMID_parse_journal($xml, $pub); break; case 'ArticleTitle': $pub['Title'] = $xml->readString(); break; case 'Abstract': tripal_pub_PMID_parse_abstract($xml, $pub); break; case 'Pagination': tripal_pub_PMID_parse_pagination($xml, $pub); break; case 'ELocationID': $type = $xml->getAttribute('EIdType'); $valid = $xml->getAttribute('ValidYN'); $xml->read(); $elocation = $xml->value; if ($type == 'doi' and $valid == 'Y') { $pub['DOI'] = $elocation; } if ($type == 'pii' and $valid == 'Y') { $pub['PII'] = $elocation; } $pub['Elocation'] = $elocation; break; case 'Affiliation': // the affiliation tag at this level is meant solely for the first author $xml->read(); $pub['Author List'][0]['Affiliation'] = $xml->value; break; case 'AuthorList': $complete = $xml->getAttribute('CompleteYN'); tripal_pub_PMID_parse_authorlist($xml, $pub); break; case 'InvestigatorList': // TODO: perhaps handle this one day. The investigator list is to list the names of people who // are members of a collective or corporate group that is an author in the paper. break; case 'Language': $xml->read(); $lang_abbr = $xml->value; // there may be multiple languages so we store these in an array $pub['Language'][] = tripal_pub_remote_search_get_language($lang_abbr); $pub['Language Abbr'][] = $lang_abbr; break; case 'DataBankList': // TODO: handle this case break; case 'GrantList': // TODO: handle this case break; case 'PublicationTypeList': tripal_pub_PMID_parse_publication_type($xml, $pub); break; case 'VernacularTitle': $xml->read(); $pub['Vernacular Title'][] = $xml->value; break; case 'ArticleDate': // TODO: figure out what to do with this element. We already have the // published date in the field, but this date should be in numeric // form and may have more information. break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information about a publication * * A full list of publication types can be found here: * http://www.nlm.nih.gov/mesh/pubtypes.html. * * The Tripal Pub ontology doesn't yet have terms for all of the * publication types so we store the value in the 'publication_type' term. * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_publication_type($xml, &$pub) { while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'PublicationTypeList') { // we've reached the element so we're done. return; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'PublicationType': $xml->read(); $value = $xml->value; $identifiers = [ 'name' => $value, 'cv_id' => [ 'name' => 'tripal_pub', ], ]; $options = ['case_insensitive_columns' => ['name']]; $pub_cvterm = chado_get_cvterm($identifiers, $options); if (!$pub_cvterm) { // see if this we can find the name using a synonym $identifiers = [ 'synonym' => [ 'name' => $value, 'cv_name' => 'tripal_pub', ], ]; $pub_cvterm = chado_get_cvterm($identifiers, $options); if (!$pub_cvterm) { tripal_report_error('tripal_pubmed', TRIPAL_ERROR, 'Cannot find a valid vocabulary term for the publication type: "%term".', ['%term' => $value]); } } else { $pub['Publication Type'][] = $pub_cvterm->name; } break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information about the abstract * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_abstract($xml, &$pub) { $abstract = ''; while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'Abstract') { // we've reached the element so return $pub['Abstract'] = $abstract; return; } // the abstract text can be just a singe paragraph or be broken into multiple // abstract texts for structured abstracts. Here we will just combine then // into a single element in the order that they arrive in HTML format if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'AbstractText': $label = $xml->getAttribute('Label'); $value = $xml->readString(); if ($label) { $part = "

$label
" . $value . '

'; $abstract .= $part; $pub['Structured Abstract Part'][] = $part; } else { $abstract .= "

" . $value . "

"; } break; case 'CopyrightInformation': $xml->read(); $pub['Copyright'] = $xml->value; break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information about pagination * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_pagination($xml, &$pub) { while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'Pagination') { // we've reached the element so we're done. return; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'MedlinePgn': $xml->read(); if (trim($xml->value)) { $pub['Pages'] = $xml->value; } break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information about a journal * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_journal($xml, &$pub) { while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'Journal') { return; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'ISSN': $issn_type = $xml->getAttribute('IssnType'); $xml->read(); $issn = $xml->value; $pub['ISSN'] = $issn; if ($issn_type == 'Electronic') { $pub['eISSN'] = $issn; } if ($issn_type == 'Print') { $pub['pISSN'] = $issn; } break; case 'JournalIssue': // valid values of cited_medium are 'Internet' and 'Print' $cited_medium = $xml->getAttribute('CitedMedium'); tripal_pub_PMID_parse_journal_issue($xml, $pub); break; case 'Title': $xml->read(); $pub['Journal Name'] = $xml->value; break; case 'ISOAbbreviation': $xml->read(); $pub['Journal Abbreviation'] = $xml->value; break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information about a journal issue * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_journal_issue($xml, &$pub) { while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'JournalIssue') { // if we're at the element then we're done return; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'Volume': $xml->read(); $pub['Volume'] = $xml->value; break; case 'Issue': $xml->read(); $pub['Issue'] = $xml->value; break; case 'PubDate': $date = tripal_pub_PMID_parse_date($xml, 'PubDate'); $year = $date['year']; $month = array_key_exists('month', $date) ? $date['month'] : ''; $day = array_key_exists('day', $date) ? $date['day'] : ''; $medline = array_key_exists('medline', $date) ? $date['medline'] : ''; $pub['Year'] = $year; if ($month and $day and $year) { $pub['Publication Date'] = "$year $month $day"; } elseif ($month and !$day and $year) { $pub['Publication Date'] = "$year $month"; } elseif (!$month and !$day and $year) { $pub['Publication Date'] = $year; } elseif ($medline) { $pub['Publication Date'] = $medline; } else { $pub['Publication Date'] = "Date Unknown"; } break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information regarding to dates * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_date($xml, $element_name) { $date = []; while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT and $element == $element_name) { // if we're at the then we're done return $date; } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'Year': $xml->read(); $date['year'] = $xml->value; break; case 'Month': $xml->read(); $month = $date['month'] = $xml->value; break; case 'Day': $xml->read(); $date['day'] = $xml->value; break; case 'MedlineDate': // the medline date is when the date cannot be broken into distinct month day year. $xml->read(); $date['year'] = preg_replace('/^(\d{4}).*$/', '\1', $xml->value); $date['medline'] = $xml->value; break; default: break; } } } } /** * Parses the section from the XML returned from PubMed that contains * information about the author list for a publication * * @param $xml * The XML to parse * @param $pub * The publication object to which additional details will be added * * @ingroup tripal_pub */ function tripal_pub_PMID_parse_authorlist($xml, &$pub) { $num_authors = 0; while ($xml->read()) { $element = $xml->name; if ($xml->nodeType == XMLReader::END_ELEMENT) { // if we're at the element then we're done with the article... if ($element == 'AuthorList') { // build the author list before returning $authors = ''; foreach ($pub['Author List'] as $author) { if ($author['valid'] == 'N') { // skip non-valid entries. A non-valid entry should have // a corresponding corrected entry so we can saftely skip it. continue; } if (array_key_exists('Collective', $author)) { $authors .= $author['Collective'] . ', '; } else { $authors .= $author['Surname'] . ' ' . $author['First Initials'] . ', '; } } $authors = substr($authors, 0, -2); $pub['Authors'] = $authors; return; } // if we're at the end element then we're done with the author and we can // start a new one. if ($element == 'Author') { $num_authors++; } } if ($xml->nodeType == XMLReader::ELEMENT) { switch ($element) { case 'Author': $valid = $xml->getAttribute('ValidYN'); $pub['Author List'][$num_authors]['valid'] = $valid; break; case 'LastName': $xml->read(); $pub['Author List'][$num_authors]['Surname'] = $xml->value; break; case 'ForeName': $xml->read(); $pub['Author List'][$num_authors]['Given Name'] = $xml->value; break; case 'Initials': $xml->read(); $pub['Author List'][$num_authors]['First Initials'] = $xml->value; break; case 'Suffix': $xml->read(); $pub['Author List'][$num_authors]['Suffix'] = $xml->value; break; case 'CollectiveName': $xml->read(); $pub['Author List'][$num_authors]['Collective'] = $xml->value; break; case 'Identifier': // according to the specification, this element is not yet used. break; default: break; } } } } /** * Get the name of the language based on an abbreviation * * Language abbreviations were obtained here: * http://www.nlm.nih.gov/bsd/language_table.html * * @param $lang_abbr * The abbreviation of the language to return * * @return * The full name of the language * * @ingroup tripal_pub */ function tripal_pub_remote_search_get_language($lang_abbr) { $languages = [ 'afr' => 'Afrikaans', 'alb' => 'Albanian', 'amh' => 'Amharic', 'ara' => 'Arabic', 'arm' => 'Armenian', 'aze' => 'Azerbaijani', 'ben' => 'Bengali', 'bos' => 'Bosnian', 'bul' => 'Bulgarian', 'cat' => 'Catalan', 'chi' => 'Chinese', 'cze' => 'Czech', 'dan' => 'Danish', 'dut' => 'Dutch', 'eng' => 'English', 'epo' => 'Esperanto', 'est' => 'Estonian', 'fin' => 'Finnish', 'fre' => 'French', 'geo' => 'Georgian', 'ger' => 'German', 'gla' => 'Scottish Gaelic', 'gre' => 'Greek, Modern', 'heb' => 'Hebrew', 'hin' => 'Hindi', 'hrv' => 'Croatian', 'hun' => 'Hungarian', 'ice' => 'Icelandic', 'ind' => 'Indonesian', 'ita' => 'Italian', 'jpn' => 'Japanese', 'kin' => 'Kinyarwanda', 'kor' => 'Korean', 'lat' => 'Latin', 'lav' => 'Latvian', 'lit' => 'Lithuanian', 'mac' => 'Macedonian', 'mal' => 'Malayalam', 'mao' => 'Maori', 'may' => 'Malay', 'mul' => 'Multiple languages', 'nor' => 'Norwegian', 'per' => 'Persian', 'pol' => 'Polish', 'por' => 'Portuguese', 'pus' => 'Pushto', 'rum' => 'Romanian, Rumanian, Moldovan', 'rus' => 'Russian', 'san' => 'Sanskrit', 'slo' => 'Slovak', 'slv' => 'Slovenian', 'spa' => 'Spanish', 'srp' => 'Serbian', 'swe' => 'Swedish', 'tha' => 'Thai', 'tur' => 'Turkish', 'ukr' => 'Ukrainian', 'und' => 'Undetermined', 'urd' => 'Urdu', 'vie' => 'Vietnamese', 'wel' => 'Welsh', ]; return $languages[strtolower($lang_abbr)]; }