tripal_chado.pub_importer_AGL.inc 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114
  1. <?php
  2. /**
  3. * @file
  4. * This file provides support for importing and parsing of results from the
  5. * USDA National Agricultural Library (AGL) database. The functions here are
  6. * used by both the publication importer setup form and the publication
  7. * importer. The USDA AGL database uses a YAZ protocol for querying and
  8. * retrieving records.
  9. *
  10. */
  /**
   * Adjusts the publication importer setup form for the AGL remote database.
   *
   * The 'days' element is relabelled so that it holds the earliest year of
   * publication, and the 'journal' choice is removed from the scope selector
   * of every criteria row.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array (not read by this function).
   * @param $num_criteria
   *   The number of criteria the user currently has added to the form.
   *
   * @return
   *   The altered form array.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_alter_form_AGL($form, $form_state, $num_criteria = 1) {
    // AGL has not been able to restrict results to the last N days, so the
    // 'days' element is repurposed to carry the earliest year to query.
    $year_properties = [
      '#title' => t('Earliest year of publication'),
      '#description' => t('Filter returned publications for those that have been published no earlier than this year.'),
    ];
    foreach ($year_properties as $property => $text) {
      $form['themed_element']['days'][$property] = $text;
    }

    // The journal name filter does not seem to work for AGL, so drop it from
    // each criteria row (rows are numbered starting at 1).
    $row = 1;
    while ($row <= $num_criteria) {
      unset($form['themed_element']['criteria'][$row]["scope-$row"]['#options']['journal']);
      $row++;
    }

    return $form;
  }
  /**
   * Performs AGL-specific validation of the importer setup form.
   *
   * Checks that the earliest and latest year fields, when given, are four
   * digit years, that every accession criterion looks like "AGL:<digits>",
   * and that at most one accession criterion is present.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array. The submitted values are read from
   *   $form_state['values'].
   *
   * @return
   *   The form array, unchanged. Problems are reported via form_set_error().
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_validate_form_AGL($form, $form_state) {
    $values = $form_state['values'];

    // The year fields are optional, so tolerate them being absent entirely.
    $days = isset($values['days']) ? trim($values['days']) : '';
    $latestyear = isset($values['latestyear']) ? trim($values['latestyear']) : '';
    $num_criteria = $values['num_criteria'];

    // For AGL the 'days' element holds the earliest year of publication.
    if ($days and !preg_match('/^\d\d\d\d$/', $days)) {
      form_set_error("days", "Please enter a four digit year.");
    }
    if ($latestyear and !preg_match('/^\d\d\d\d$/', $latestyear)) {
      form_set_error("latestyear", "Please enter a four digit year.");
    }

    // Criteria rows are numbered starting at 1.
    $num_ids = 0;
    for ($i = 1; $i <= $num_criteria; $i++) {
      $search_terms = trim($values["search_terms-$i"]);
      $scope = $values["scope-$i"];

      if ($scope != 'id') {
        continue;
      }

      if (!preg_match('/^AGL:\d+$/', $search_terms)) {
        form_set_error("search_terms-$i", "The AGL accession must be a numeric value, prefixed with 'AGL:' (e.g. AGL:3890740).");
      }

      // Only a single accession is supported, so flag every accession row
      // after the first one.
      $num_ids++;
      if ($num_ids > 1) {
        form_set_error("search_terms-$i", "Unfortunately, the AGL importer can only support a single accession at a time. Please remove the others.");
      }
    }

    return $form;
  }
  /**
   * Builds the CCL query string for an AGL search from the importer criteria.
   *
   * The search form lets the user add several rows for the same field, but
   * the CCL string needs a single clause per field (and a single 'not' clause
   * per field). Rows are therefore grouped by field: the first row of a group
   * contributes only its terms, later rows contribute "<op> (<terms>)". The
   * operator of the first row of a group is used to join that group to the
   * clauses before it. Groups are emitted in the order in which they first
   * appear in the criteria.
   *
   * @param $search_array
   *   The search criteria array; 'num_criteria' and the 1-based 'criteria'
   *   list (search_terms, scope, is_phrase, operation) are read.
   *
   * @return
   *   The CCL query string with any leading 'and'/'or' removed.
   */
  function _tripal_pub_AGL_build_ccl($search_array) {
    $known_fields = ['title', 'author', 'abstract', 'journal', 'id', 'any'];
    $num_criteria = $search_array['num_criteria'];

    // Accumulated expression per group, keyed by scope ('title',
    // 'negate_title', ...). Insertion order is the order of first appearance.
    $clauses = [];
    // Operator of the first criterion of each group.
    $lead_ops = [];

    for ($i = 1; $i <= $num_criteria; $i++) {
      $criterion = $search_array['criteria'][$i];
      $search_terms = trim($criterion['search_terms']);
      $scope = (string) $criterion['scope'];
      $is_phrase = !empty($criterion['is_phrase']);
      $op = isset($criterion['operation']) ? $criterion['operation'] : '';
      if ($op) {
        $op = strtolower($op);
      }

      if ($is_phrase) {
        // A phrase is searched verbatim, so surround it with quotes.
        $search_terms = "\"$search_terms\"";
      }
      else {
        // CCL operators inside free-form terms must be lower-case.
        $search_terms = preg_replace('/ OR /', ' or ', $search_terms);
        $search_terms = preg_replace('/ AND /', ' and ', $search_terms);
      }

      // A 'not' row goes into the negated group of its field. Several negated
      // rows of one field are or-ed together inside a single 'not' clause.
      if ($op == 'not') {
        $scope = "negate_$scope";
        $op = 'or';
      }

      $field = (strpos($scope, 'negate_') === 0) ? substr($scope, 7) : $scope;
      if (!in_array($field, $known_fields, TRUE)) {
        // Unknown scopes contribute nothing to the query.
        continue;
      }

      if ($field == 'id') {
        // Accessions are entered as "AGL:12345" but queried without prefix.
        $search_terms = preg_replace('/AGL:([^\s]*)/', '$1', $search_terms);
      }

      if (isset($clauses[$scope])) {
        $clauses[$scope] .= "$op ($search_terms) ";
      }
      else {
        $clauses[$scope] = "($search_terms) ";
        $lead_ops[$scope] = $op;
      }
    }

    $ccl = '';
    foreach ($clauses as $scope => $expression) {
      $negated = (strpos($scope, 'negate_') === 0);
      $field = $negated ? substr($scope, 7) : $scope;
      $prefix = $negated ? 'not' : $lead_ops[$scope];
      // 'any' searches without a field qualifier.
      $target = ($field == 'any') ? '' : "$field=";
      $ccl .= "$prefix $target($expression) ";
    }

    // Remove any preceding 'and' or 'or'.
    return preg_replace('/^\s*(and|or)/', '', $ccl);
  }

  /**
   * A hook for performing the search on the AGL database.
   *
   * The criteria are converted to a CCL query, parsed into RPN by YAZ and
   * sent to the NAL Article Citation Database over Z39.50.
   *
   * @param $search_array
   *   An array containing the search criteria for the search.
   * @param $num_to_retrieve
   *   Indicates the maximum number of publications to retrieve from the remote
   *   database.
   * @param $page
   *   Indicates the page to retrieve. This corresponds to a paged table, where
   *   each page has $num_to_retrieve publications.
   *
   * @return
   *   An array with the keys 'total_records', 'search_str' and 'pubs' (the
   *   array of publications). On a connection or query parsing error
   *   'total_records' is 0 and 'pubs' is empty.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL($search_array, $num_to_retrieve, $page) {
    // For AGL the 'days' form element was converted to represent the earliest
    // year. Year limits do not work as AGL search terms, so they are not put
    // into the query; they are passed on so the returned records can be
    // filtered instead.
    $days = array_key_exists('days', $search_array) ? $search_array['days'] : '';
    $latestyear = array_key_exists('latestyear', $search_array) ? $search_array['latestyear'] : '';

    $no_results = [
      'total_records' => 0,
      'search_str' => '',
      'pubs' => [],
    ];

    $ccl = _tripal_pub_AGL_build_ccl($search_array);

    // yaz_connect() prepares for a connection to a Z39.50 server. It is
    // non-blocking and does not attempt to establish a connection - it merely
    // prepares a connect to be performed later when yaz_wait() is called.
    // This is the NAL Article Citation Database; the NAL Catalog is at
    // agricola.nal.usda.gov:7090/voyager.
    $yazc = yaz_connect('agricola.nal.usda.gov:7190/voyager');
    if (!$yazc) {
      drupal_set_message('Error preparing the connection to AGL.', "error");
      watchdog('tpub_import', 'Error preparing the connection to AGL.', [], WATCHDOG_ERROR);
      return $no_results;
    }

    // Use the USMARC record type. OPAC is also supported by Agricola.
    yaz_syntax($yazc, "usmarc");

    // The search query is built using CCL, which must first be configured so
    // it can map the field names to attribute identifiers.
    // The attribute set used by AGL can be found at the bottom of this page:
    // https://agricola.nal.usda.gov/help/z3950.html
    // Boolean searching (AND, OR, NOT) is supported on search types with
    // a position attribute of 3 only.
    //
    // More in depth details: http://www.loc.gov/z3950/agency/bib1.html
    //
    // CCL Syntax: https://www.indexdata.com/yaz/doc/tools.html#CCL
    //
    // The abstract field u=62, year u=30, and journal u=1033 are not in the
    // documented list of supported values at
    // https://agricola.nal.usda.gov/help/z3950.html
    // publisherdate u=31 is listed, but if used returns zero results.
    $fields = [
      "title" => "u=4",
      "author" => "u=1003",
      "abstract" => "u=62",
      "id" => "u=12",
      "year" => "u=30 r=o",
      "journal" => "u=1033",
    ];
    yaz_ccl_conf($yazc, $fields);

    if (!yaz_ccl_parse($yazc, $ccl, $cclresult)) {
      drupal_set_message('Error parsing search string: ' . $cclresult["errorstring"], "error");
      watchdog('tpub_import', 'Error: %errstr', ['%errstr' => $cclresult["errorstring"]], WATCHDOG_ERROR);
      // Do not leave the connection open on the error path.
      yaz_close($yazc);
      return $no_results;
    }
    $search_str = $cclresult["rpn"];

    // Get the total number of records.
    $total_records = tripal_pub_AGL_count($yazc, $search_str);

    // Get the pubs in the specified range.
    $start = $page * $num_to_retrieve;
    $results = tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records, $days, $latestyear);

    // Close the connection.
    yaz_close($yazc);

    return $results;
  }
  /**
   * Retrieves a range of publications from AGL.
   *
   * Only the requested window of records is asked for: requesting the whole
   * result set (1 to $total_records) caused "(10007) Timeout" or
   * "(10004) Connection lost" errors for large queries; empirically the
   * errors started somewhere between 1550 and 1575 records.
   *
   * Because year limits do not work as AGL search terms, they are applied
   * here: every retrieved publication gets a 'passfilter' flag. The reported
   * total is therefore the unfiltered count, and filtered-out publications
   * are still downloaded and returned (flagged with 'passfilter' = 0).
   *
   * @param $yazc
   *   The YAZ connection resource.
   * @param $search_str
   *   The search string to use for searching.
   * @param $start
   *   The 0-based offset of the first record of the range.
   * @param $num_to_retrieve
   *   The number of publications to retrieve.
   * @param $total_records
   *   The total number of records in the dataset. This value should have
   *   been retrieved by the tripal_pub_AGL_count() function.
   * @param $startyear
   *   Optional. Publications with a 'Year' earlier than this are flagged as
   *   not passing the filter.
   * @param $endyear
   *   Optional. Publications with a 'Year' later than this are flagged as
   *   not passing the filter.
   *
   * @return
   *   An array containing the total_records in the dataset, the search string
   *   and an array of the publications that were retrieved. On a retrieval
   *   error total_records is 0 and the publication array is empty.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records, $startyear = '', $endyear = '') {
    $failure = [
      'total_records' => 0,
      'search_str' => $search_str,
      'pubs' => [],
    ];

    // Never ask for records past the end of the result set.
    if (($start + $num_to_retrieve) > $total_records) {
      $num_to_retrieve = $total_records - $start;
    }

    // Nothing to fetch (no hits, or a page beyond the last record).
    if ($num_to_retrieve <= 0) {
      return [
        'total_records' => $total_records,
        'search_str' => $search_str,
        'pubs' => [],
      ];
    }

    // Record positions in YAZ are 1-based while $start is 0-based.
    yaz_range($yazc, $start + 1, $num_to_retrieve);
    if (!yaz_present($yazc)) {
      $error_no = yaz_errno($yazc);
      $error_msg = yaz_error($yazc);
      $additional = yaz_addinfo($yazc);
      if ($additional != $error_msg) {
        $error_msg .= " $additional";
      }
      drupal_set_message("ERROR waiting on search at AGL: ($error_no) $error_msg", "error");
      watchdog('tpub_import', "ERROR waiting on search at AGL: (%error_no) %error_msg",
        ['%error_no' => $error_no, '%error_msg' => $error_msg], WATCHDOG_ERROR);
      return $failure;
    }

    $pubs = [];
    $end = $start + $num_to_retrieve;
    for ($i = $start; $i < $end; $i++) {
      // Retrieve the XML for the record (1-based position).
      $pub_xml = yaz_record($yazc, $i + 1, 'xml; charset=marc-8,utf-8');
      if (!$pub_xml) {
        $error_no = yaz_errno($yazc);
        $error_msg = yaz_error($yazc);
        drupal_set_message("ERROR retrieving records from AGL: ($error_no) $error_msg", "error");
        watchdog('tpub_import', "ERROR retrieving records from AGL: (%error_no) %error_msg",
          ['%error_no' => $error_no, '%error_msg' => $error_msg], WATCHDOG_ERROR);
        return $failure;
      }

      // Parse the pub XML.
      $pub = tripal_pub_AGL_parse_pubxml($pub_xml);

      // Apply the year limits. Publications without a 'Year' always pass.
      $pass = 1;
      if (array_key_exists('Year', $pub)) {
        if ($startyear and $pub['Year'] < $startyear) {
          $pass = 0;
        }
        if ($endyear and $pub['Year'] > $endyear) {
          $pass = 0;
        }
      }
      $pub['passfilter'] = $pass;
      $pubs[] = $pub;
    }

    return [
      'total_records' => $total_records,
      'search_str' => $search_str,
      'pubs' => $pubs,
    ];
  }
  /**
   * Retrieves the total number of publications that match the search string.
   *
   * Runs the search on the given connection and waits for it to complete.
   * Any failure is shown to the user, logged to the watchdog and reported as
   * a count of 0.
   *
   * @param $yazc
   *   The YAZ connection resource.
   * @param $search_str
   *   The search string (RPN query) to use for searching.
   *
   * @return
   *   A count of the total number of publications that match the search
   *   string, or 0 if the search failed.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_count($yazc, $search_str) {
    $failure = '';
    if (!yaz_search($yazc, "rpn", $search_str)) {
      $failure = 'ERROR preparing search at AGL';
    }
    // The outcome of the network activity is per connection: it has to be
    // read with yaz_errno() once yaz_wait() has returned, the return value of
    // yaz_wait() alone does not report a failed search.
    elseif (!yaz_wait() or yaz_errno($yazc) != 0) {
      $failure = 'ERROR waiting on search at AGL';
    }

    if ($failure) {
      $error_no = yaz_errno($yazc);
      $error_msg = yaz_error($yazc);
      $additional = yaz_addinfo($yazc);
      if ($additional != $error_msg) {
        $error_msg .= " $additional";
      }
      drupal_set_message("$failure: ($error_no) $error_msg", "error");
      watchdog('tpub_import', "$failure: (%error_no) %error_msg",
        ['%error_no' => $error_no, '%error_msg' => $error_msg], WATCHDOG_ERROR);
      return 0;
    }

    // Get the total number of results from the search.
    return yaz_hits($yazc);
  }
  /**
   * Decodes the unusual text encoding returned from our call to
   * yaz_record(..., 'xml; charset=marc-8,utf-8').
   *
   * Some characters are in UTF-8, some are encoded as HTML entities, and some
   * HTML entities are double-encoded, for example &amp;#x2018;. Converting
   * the whole string would corrupt the characters that already are UTF-8, so
   * only the substrings that look like an HTML entity are converted.
   *
   * @param $text
   *   The string to be decoded to "pure" UTF-8.
   *
   * @return
   *   The decoded string.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_decode($text) {
    // First handle double encoding situations by replacing &amp;.
    $text = preg_replace("/&amp;/", "&", $text);

    // Then only replace things that look like an HTML entity, i.e. an
    // ampersand followed within 32 characters by a semicolon with no control,
    // mark or separator characters in between, in order to leave UTF-8
    // intact. html_entity_decode() is used because the 'HTML-ENTITIES'
    // pseudo-encoding of mb_convert_encoding() is deprecated as of PHP 8.2.
    $text = preg_replace_callback(
      "/(&[^;\p{C}\p{M}\p{Z}]{1,31};)/",
      function ($match) {
        return html_entity_decode($match[1], ENT_QUOTES | ENT_HTML401, 'UTF-8');
      },
      $text
    );

    return $text;
  }
  553. /**
  554. * Parse publication XML for a single publication
  555. *
  556. * Description of XML format:
  557. * http://www.loc.gov/marc/bibliographic/bdsummary.html
  558. *
  559. * @param $pub_xml
  560. * A string containing the XML for a single publications
  561. *
  562. * @return
  563. * An array containing the details of the publication
  564. *
  565. * @ingroup tripal_pub
  566. */
  567. function tripal_pub_AGL_parse_pubxml($pub_xml) {
  568. $pub = [];
  569. // we will set the default publication type as a journal article. The NAL
  570. // dataset doesn't specify an article type so we'll have to glean the type
  571. // from other information (e.g. series name has 'Proceedings' in it)
  572. $pub['Publication Type'][0] = 'Journal Article';
  573. if (!$pub_xml) {
  574. return $pub;
  575. }
  576. // read the XML and iterate through it.
  577. $xml = new XMLReader();
  578. $xml->xml(trim($pub_xml));
  579. while ($xml->read()) {
  580. $element = $xml->name;
  581. if ($xml->nodeType == XMLReader::ELEMENT and $element == 'controlfield') {
  582. $tag = $xml->getAttribute('tag');
  583. $xml->read();
  584. $value = $xml->value;
  585. switch ($tag) {
  586. case '001': // control number
  587. $pub['Publication Accession'] = $value;
  588. break;
  589. case '003': // control number identifier
  590. break;
  591. case '005': // datea nd time of latest transaction
  592. break;
  593. case '006': // fixed-length data elemetns
  594. break;
  595. case '007': // physical description fixed field
  596. break;
  597. case '008': // fixed length data elements
  598. $month = [
  599. '01' => 'Jan',
  600. '02' => 'Feb',
  601. '03' => 'Mar',
  602. '04' => 'Apr',
  603. '05' => 'May',
  604. '06' => 'Jun',
  605. '07' => 'Jul',
  606. '08' => 'Aug',
  607. '09' => 'Sep',
  608. '10' => 'Oct',
  609. '11' => 'Nov',
  610. '12' => 'Dec',
  611. ];
  612. $date0 = mb_substr($value, 0, 6); // date entered on file
  613. $typeofdate = mb_substr($value, 6, 1); // e = detailed date
  614. $date1 = mb_substr($value, 7, 4); // year of publication
  615. $date2 = mb_substr($value, 11, 2); // month of publication
  616. $date3 = mb_substr($value, 13, 2); // day of publication
  617. $place = mb_substr($value, 15, 3);
  618. $lang = mb_substr($value, 35, 3);
  619. if (preg_match('/\d\d\d\d/', $date1)) {
  620. $pub['Year'] = $date1;
  621. $pub['Publication Date'] = $date1;
  622. }
  623. if (($typeofdate == 'e') and (preg_match('/\d\d/', $date2))) {
  624. if ( ( $date2 >= 1 ) and ( $date2 <= 12 ) ) {
  625. $pub['Publication Date'] = $date1 . " " . $month[$date2] . " " . $date3;
  626. }
  627. else {
  628. drupal_set_message("Invalid month value \"$date2\" extracted from \"$value\"", "warning");
  629. }
  630. }
  631. if (!preg_match('/\s+/', $place)) {
  632. $pub['Published Location'] = $place;
  633. }
  634. if (!preg_match('/\s+/', $lang)) {
  635. $pub['Language Abbr'] = $lang;
  636. }
  637. break;
  638. default: // unhandled tag
  639. break;
  640. }
  641. }
  642. elseif ($xml->nodeType == XMLReader::ELEMENT and $element == 'datafield') {
  643. $tag = $xml->getAttribute('tag');
  644. $ind1 = $xml->getAttribute('ind1');
  645. $ind2 = $xml->getAttribute('ind2');
  646. switch ($tag) {
  647. case '16': // National Bibliographic Agency Control Number
  648. break;
  649. case '35': // System Control Number
  650. $author = [];
  651. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  652. foreach ($codes as $code => $value) {
  653. switch ($code) {
  654. case 'a': // System control number
  655. // rarely there will be a second control number with a "ns" prefix. Ignore them
  656. if (!preg_match('/^ns/', $value)) {
  657. $pub['Publication Accession'] = $value;
  658. }
  659. break;
  660. }
  661. }
  662. case '40': // Cataloging Source (NR)
  663. $author = [];
  664. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  665. foreach ($codes as $code => $value) {
  666. switch ($code) {
  667. case 'a': // original cataolging agency
  668. $pub['Publication Database'] = $value;
  669. break;
  670. }
  671. }
  672. break;
  673. case '72': // Subject Category Code
  674. break;
  675. case '100': // main entry-personal name
  676. $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
  677. $pub['Author List'][] = $author;
  678. break;
  679. case '110': // main entry-corporate nmae
  680. $author = [];
  681. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  682. foreach ($codes as $code => $value) {
  683. switch ($code) {
  684. case 'a': // Corporate name or jurisdiction name as entry elemen
  685. $author['Collective'] = $value;
  686. break;
  687. case 'b': // Subordinate unit
  688. $author['Collective'] .= ' ' . $value;
  689. break;
  690. }
  691. }
  692. $pub['Author List'][] = $author;
  693. break;
  694. case '111': // main entry-meeting name
  695. break;
  696. case '130': // main entry-uniform title
  697. break;
  698. case '210': // abbreviated title
  699. break;
  700. case '222': // key title
  701. break;
  702. case '240': // uniform title
  703. break;
  704. case '242': // translation of title by cataloging agency
  705. break;
  706. case '243': // collective uniform title
  707. break;
  708. case '245': // title statement
  709. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  710. foreach ($codes as $code => $value) {
  711. switch ($code) {
  712. case 'a':
  713. $pub['Title'] = trim(preg_replace('/\.$/', '', $value));
  714. break;
  715. case 'b':
  716. $pub['Title'] .= ' ' . $value;
  717. break;
  718. case 'h':
  719. $pub['Publication Model'] = $value;
  720. break;
  721. }
  722. }
  723. break;
  724. case '246': // varying form of title
  725. break;
  726. case '247': // former title
  727. break;
  728. case '250': // edition statement
  729. break;
  730. case '254': // musicla presentation statement
  731. break;
  732. case '255': // cartographic mathematical data
  733. break;
  734. case '256': // computer file characteristics
  735. break;
  736. case '257': // country of producing entity
  737. break;
  738. case '258': // philatelic issue data
  739. break;
  740. case '260': // publication, distribution ,etc (imprint)
  741. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  742. foreach ($codes as $code => $value) {
  743. switch ($code) {
  744. case 'a':
  745. $pub['Published Location'] = $value;
  746. break;
  747. case 'b':
  748. $pub['Publisher'] = $value;
  749. break;
  750. case 'c':
  751. $pub['Publication Date'] = $value;
  752. break;
  753. }
  754. }
  755. break;
  756. case '263': // projected publication date
  757. break;
  758. case '264': // production, publication, distribution, manufacture and copyright notice
  759. break;
  760. case '270': // Address
  761. break;
  762. case '300': // Address
  763. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  764. foreach ($codes as $code => $value) {
  765. switch ($code) {
  766. case 'a':
  767. $pages = $value;
  768. $pages = preg_replace('/^p\. /', '', $pages);
  769. $pages = preg_replace('/\.$/', '', $pages);
  770. if (preg_match('/p$/', $pages)) {
  771. // skip this, it's the number of pages not the page numbers
  772. }
  773. else {
  774. $pub['Pages'] = $pages;
  775. }
  776. break;
  777. }
  778. }
  779. break;
  780. case '500': // series statements
  781. $pub['Notes'] = $value;
  782. break;
  783. case '504': // Bibliography, Etc. Note
  784. break;
  785. case '520': // Summary, etc
  786. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  787. foreach ($codes as $code => $value) {
  788. switch ($code) {
  789. case 'a':
  790. $pub['Abstract'] = $value;
  791. break;
  792. }
  793. }
  794. break;
  795. case '650': // Subject Added Entry-Topical Term
  796. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  797. foreach ($codes as $code => $value) {
  798. switch ($code) {
  799. case 'a':
  800. $pub['Keywords'][] = $value;
  801. break;
  802. }
  803. }
  804. break;
  805. case '653': // Index Term-Uncontrolled
  806. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  807. foreach ($codes as $code => $value) {
  808. switch ($code) {
  809. case 'a':
  810. $pub['Keywords'][] = $value;
  811. break;
  812. }
  813. }
  814. break;
  815. case '700': // Added Entry-Personal Name
  816. $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
  817. $pub['Author List'][] = $author;
  818. break;
  819. case '710': // Added Entry-Corporate Name
  820. $author = [];
  821. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  822. foreach ($codes as $code => $value) {
  823. switch ($code) {
  824. case 'a': // Corporate name or jurisdiction name as entry elemen
  825. $author['Collective'] = $value;
  826. break;
  827. case 'b': // Subordinate unit
  828. $author['Collective'] .= ' ' . $value;
  829. break;
  830. }
  831. }
  832. $pub['Author List'][] = $author;
  833. break;
  834. case '773': // host item entry
  835. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  836. foreach ($codes as $code => $value) {
  837. switch ($code) {
  838. case 'a':
  839. if (preg_match('/Proceedings/i', $value)) {
  840. $pub['Series Name'] = preg_replace('/\.$/', '', $value);
  841. $pub['Publication Type'][0] = 'Conference Proceedings';
  842. }
  843. else {
  844. $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
  845. }
  846. break;
  847. case 't':
  848. if (preg_match('/Proceedings/i', $value)) {
  849. $pub['Series Name'] = preg_replace('/\.$/', '', $value);
  850. $pub['Publication Type'][0] = 'Conference Proceedings';
  851. }
  852. $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
  853. break;
  854. case 'g':
  855. $matches = [];
  856. if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
  857. $pub['Publication Date'] = $matches[1];
  858. }
  859. elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
  860. $year = $matches[4];
  861. $month = $matches[1];
  862. $day = $matches[3];
  863. $pub['Publication Date'] = "$year $month $day";
  864. }
  865. elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
  866. $year = $matches[3];
  867. $month = $matches[1];
  868. $pub['Publication Date'] = "$year $month";
  869. }
  870. elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
  871. $year = $matches[2];
  872. $month = $matches[1];
  873. $pub['Publication Date'] = "$year $month";
  874. }
  875. if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
  876. $pub['Volume'] = $matches[1];
  877. }
  878. if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
  879. $pub['Volume'] = $matches[1];
  880. $pub['Issue'] = $matches[3];
  881. }
  882. if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
  883. $pub['Issue'] = $matches[1];
  884. }
  885. break;
  886. case 'p':
  887. $pub['Journal Abbreviation'] = $value;
  888. break;
  889. case 'z':
  890. $pub['ISBN'] = $value;
  891. break;
  892. }
  893. }
  894. break;
  895. case '852': // Location (Where is the publication held)
  896. break;
  897. case '856': // Electronic Location and Access
  898. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  899. foreach ($codes as $code => $value) {
  900. switch ($code) {
  901. case 'u':
  902. $pub['URL'] = $value;
  903. break;
  904. }
  905. }
  906. break;
  907. default:
  908. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  909. $unhandled[$tag][] = $codes;
  910. break;
  911. }
  912. }
  913. }
  914. // build the Dbxref
  915. if ($pub['Publication Database'] != 'AGL') {
  916. }
  917. if ($pub['Publication Accession'] and $pub['Publication Database']) {
  918. $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
  919. unset($pub['Publication Accession']);
  920. unset($pub['Publication Database']);
  921. }
  922. // build the full authors list
  923. if (!array_key_exists('Author List', $pub)) {
  924. // there is a constraint in chado.pubprop against value being null
  925. $pub['Author List'] = [['Surname' => 'anonymous']];
  926. $pub['Authors'] = 'anonymous';
  927. }
  928. else if (is_array($pub['Author List'])) {
  929. $authors = '';
  930. foreach ($pub['Author List'] as $author) {
  931. if (array_key_exists('valid', $author) and $author['valid'] == 'N') {
  932. // skip non-valid entries. A non-valid entry should have
  933. // a corresponding corrected entry so we can saftely skip it.
  934. continue;
  935. }
  936. if (array_key_exists('Collective', $author)) {
  937. $authors .= $author['Collective'] . ', ';
  938. }
  939. else {
  940. if (array_key_exists('Surname', $author)) {
  941. $authors .= $author['Surname'];
  942. if (array_key_exists('First Initials', $author)) {
  943. $authors .= ' ' . $author['First Initials'];
  944. }
  945. $authors .= ', ';
  946. }
  947. }
  948. }
  949. $authors = mb_substr($authors, 0, -2);
  950. $pub['Authors'] = $authors;
  951. }
  952. else {
  953. $pub['Authors'] = $pub['Author List'];
  954. }
  955. // for several fields that may contain them, convert html entities to unicode characters
  956. $pub['Title'] = tripal_pub_AGL_decode($pub['Title']);
  957. if (key_exists('Abstract', $pub)) {
  958. $pub['Abstract'] = tripal_pub_AGL_decode($pub['Abstract']);
  959. }
  960. $newauths = [];
  961. if (array_key_exists('Author List', $pub)) {
  962. foreach ($pub['Author List'] AS $auth) {
  963. foreach ($auth AS $k => $v) {
  964. $auth[$k] = tripal_pub_AGL_decode($auth[$k]);
  965. }
  966. array_push($newauths, $auth);
  967. }
  968. $pub['Author List'] = $newauths;
  969. }
  970. else {
  971. $pub['Author List'] = [['Surname' => 'anonymous']];
  972. }
  973. if (array_key_exists('Authors', $pub)) {
  974. $pub['Authors'] = tripal_pub_AGL_decode($pub['Authors']);
  975. }
  976. if (array_key_exists('Keywords', $pub)) {
  977. $pub['Keywords'] = tripal_pub_AGL_decode($pub['Keywords']);
  978. }
  979. if (array_key_exists('Notes', $pub)) {
  980. $pub['Notes'] = tripal_pub_AGL_decode($pub['Notes']);
  981. }
  982. // build the citation
  983. $pub['Citation'] = tripal_pub_AGL_decode(chado_pub_create_citation($pub));
  984. $pub['raw'] = $pub_xml;
  985. return $pub;
  986. }
  987. /**
  988. * Used for parsing of the XML results to get a set of subfields
  989. *
  990. * @param $xml
  991. * The XMl object to read
  992. *
  993. * @return
  994. * An array of codes and their values
  995. *
  996. * @ingroup tripal_pub
  997. */
  998. function tripal_pub_remote_search_AGL_get_subfield($xml) {
  999. $codes = [];
  1000. while ($xml->read()) {
  1001. $sub_element = $xml->name;
  1002. // when we've reached the end of the datafield element then break out of the while loop
  1003. if ($xml->nodeType == XMLReader::END_ELEMENT and $sub_element == 'datafield') {
  1004. return $codes;
  1005. }
  1006. // if inside the subfield element then get the code
  1007. if ($xml->nodeType == XMLReader::ELEMENT and $sub_element == 'subfield') {
  1008. $code = $xml->getAttribute('code');
  1009. $xml->read();
  1010. $value = $xml->value;
  1011. $codes[$code] = $value;
  1012. }
  1013. }
  1014. return $codes;
  1015. }
  1016. /**
  1017. * Used for parsing of the XML results to get details about an author
  1018. *
  1019. * @param $xml
  1020. * The XML object to read
  1021. * @param $ind1
  1022. * Indicates how an author record is stored; 0 means given name is first
  1023. * 1 means surname is first, 3 means a family name is given
  1024. *
  1025. * @return
  1026. *
  1027. *
  1028. * @ingroup tripal_pub
  1029. */
  1030. function tripal_pub_remote_search_AGL_get_author($xml, $ind1) {
  1031. $author = [];
  1032. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  1033. foreach ($codes as $code => $value) {
  1034. switch ($code) {
  1035. case 'a':
  1036. // remove any trailing commas
  1037. $value = preg_replace('/,$/', '', $value);
  1038. if ($ind1 == 0) { // Given Name is first
  1039. $author['Given Name'] = $names[0];
  1040. }
  1041. if ($ind1 == 1) { // Surname is first
  1042. // split the parts of the name using a comma
  1043. $names = explode(',', $value);
  1044. $author['Surname'] = $names[0];
  1045. $author['Given Name'] = '';
  1046. unset($names[0]);
  1047. foreach ($names as $index => $name) {
  1048. $author['Given Name'] .= $name . ' ';
  1049. }
  1050. $first_names = explode(' ', $author['Given Name']);
  1051. $author['First Initials'] = '';
  1052. foreach ($first_names as $index => $name) {
  1053. $author['First Initials'] .= mb_substr($name, 0, 1);
  1054. }
  1055. }
  1056. if ($ind1 == 3) { // A family name, occurs rarely
  1057. $author['Surname'] = $value;
  1058. }
  1059. break;
  1060. }
  1061. }
  1062. return $author;
  1063. }