tripal_pub.AGL.inc 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027
  1. <?php
  2. /**
  3. * @file
  4. *
  5. * Importer for the USDA Agricultural Library (Agricola).
  6. *
  7. * This file provides support for importing and parsing of results from the
  8. * USDA National Agricultural Library (AGL) database. The functions here are
  9. * used by both the publication importer setup form and the publication
  10. * importer. The USDA AGL database is queried over the Z39.50 protocol
  11. * using the PHP YAZ extension.
  12. *
  13. */
  /**
   * Adjusts the publication importer form for the AGL database.
   *
   * The 'days' element is relabelled so that it collects a year, and the
   * 'journal' choice is dropped from the scope selector of every criteria row.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array (not used here).
   * @param $num_criteria
   *   The number of criteria rows currently present on the form.
   *
   * @return
   *   The altered form array.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_alter_form_AGL($form, $form_state, $num_criteria = 1) {
    // The Journal Name filter doesn't seem to work with AGL, so take it out of
    // the scope options of each criteria row (rows are numbered from 1).
    $row = 1;
    while ($row <= $num_criteria) {
      unset($form['themed_element']['criteria'][$row]['scope-' . $row]['#options']['journal']);
      $row++;
    }

    // AGL has not been able to limit results to the last XX days, so the
    // 'days' element is repurposed to hold the year to query.
    $year_properties = array(
      '#title' => t('Year'),
      '#description' => t('Please enter a year to limit records by the year they were published, created or modified in the database.'),
    );
    foreach ($year_properties as $property => $text) {
      $form['themed_element']['days'][$property] = $text;
    }

    return $form;
  }
  /**
   * A hook for providing additional validation of the importer setup form.
   *
   * Checks that the 'days' element (used as a year for AGL) is a four digit
   * year, that every criteria row scoped to 'id' holds an accession of the form
   * 'AGL:<digits>', and that at most one such row is present.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array. The submitted 'days', 'num_criteria',
   *   'scope-N' and 'search_terms-N' values are read from it.
   *
   * @return
   *   The form array, unchanged. Problems are reported with form_set_error().
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_validate_form_AGL($form, $form_state) {
    $values = $form_state['values'];
    $days = trim($values['days']);
    $num_criteria = $values['num_criteria'];

    // Compare against the empty string rather than relying on truthiness so
    // that a value such as "0" is reported instead of silently accepted.
    if ($days !== '' and !preg_match('/^\d\d\d\d$/', $days)) {
      form_set_error("days", "Please enter a four digit year.");
    }

    $num_ids = 0;
    for ($i = 1; $i <= $num_criteria; $i++) {
      // Only accession (id) rows need extra validation.
      if ($values["scope-$i"] != 'id') {
        continue;
      }
      $search_terms = trim($values["search_terms-$i"]);
      if (!preg_match('/^AGL:\d+$/', $search_terms)) {
        form_set_error("search_terms-$i", "The AGL accession must be a numeric value, prefixed with 'AGL:' (e.g. AGL:3890740).");
      }
      // Flag only the surplus accession rows themselves, never an unrelated
      // row that merely follows them.
      $num_ids++;
      if ($num_ids > 1) {
        form_set_error("search_terms-$i", "Unfortunately, the AGL importer can only support a single accession at a time. Please remove the others.");
      }
    }
    return $form;
  }
  /**
   * A hook for performing the search on the AGL database.
   *
   * Builds a CCL query from the importer criteria, has the YAZ CCL parser
   * convert it to RPN, counts the matching records and retrieves the requested
   * page of publications.
   *
   * @param $search_array
   *   An array containing the search criteria. The keys read here are
   *   'num_criteria', the optional 'days' (a year for AGL) and 'criteria', a
   *   list indexed from 1 whose entries hold 'search_terms', 'scope',
   *   'is_phrase' and 'operation'.
   * @param $num_to_retrieve
   *   Indicates the maximum number of publications to retrieve from the remote
   *   database.
   * @param $page
   *   Indicates the page to retrieve. This corresponds to a paged table, where
   *   each page has $num_to_retrieve publications. The first record of the page
   *   is at offset $page * $num_to_retrieve.
   *
   * @return
   *   An array with the keys 'total_records', 'search_str' and 'pubs'. When the
   *   search cannot be set up or parsed, 'total_records' is 0, 'search_str' is
   *   empty and 'pubs' is an empty array.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL($search_array, $num_to_retrieve, $page) {
    $no_results = array(
      'total_records' => 0,
      'search_str' => '',
      'pubs' => array(),
    );

    // Without the YAZ extension none of the yaz_*() calls below exist; report
    // the problem instead of dying with an undefined function error.
    if (!function_exists('yaz_connect')) {
      $message = 'The PHP YAZ extension is not installed; the AGL database cannot be searched.';
      drupal_set_message($message, "error");
      watchdog('tpub_import', $message, array(), WATCHDOG_ERROR);
      return $no_results;
    }

    $ccl = _tripal_pub_AGL_build_ccl($search_array);

    // yaz_connect() prepares for a connection to a Z39.50 server. This function
    // is non-blocking and does not attempt to establish a connection - it
    // merely prepares a connect to be performed later when yaz_wait() is
    // called.
    // NAL Catalog: agricola.nal.usda.gov:7090/voyager
    // NAL Article Citation Database (used here):
    $yazc = yaz_connect('agricola.nal.usda.gov:7190/voyager');
    if (!$yazc) {
      $message = 'Could not prepare a connection to the AGL database.';
      drupal_set_message($message, "error");
      watchdog('tpub_import', $message, array(), WATCHDOG_ERROR);
      return $no_results;
    }

    // Use the USMARC record type. But OPAC is also supported by Agricola.
    yaz_syntax($yazc, "usmarc");

    // The search query is built using CCL, we need to first configure it so it
    // can map the attributes to defined identifiers. The attribute set used by
    // AGL can be found at the bottom of this page:
    // http://agricola.nal.usda.gov/help/z3950.html
    //
    // More in depth details: http://www.loc.gov/z3950/agency/bib1.html
    //
    // CCL Syntax: http://www.indexdata.com/yaz/doc/tools.html#CCL
    $fields = array(
      "title" => "u=4",
      "author" => "u=1003",
      "abstract" => "u=62",
      "id" => "u=12",
      "year" => "u=30 r=o",
      "journal" => "u=1033"
    );
    yaz_ccl_conf($yazc, $fields);

    if (!yaz_ccl_parse($yazc, $ccl, $cclresult)) {
      drupal_set_message('Error parsing search string: ' . $cclresult["errorstring"], "error");
      watchdog('tpub_import', 'Error: %errstr', array('%errstr' => $cclresult["errorstring"]), WATCHDOG_ERROR);
      // Release the connection on this early exit as well.
      yaz_close($yazc);
      return $no_results;
    }
    $search_str = $cclresult["rpn"];

    // Get the total number of records.
    $total_records = tripal_pub_AGL_count($yazc, $search_str);

    // Get the pubs in the specified range.
    $start = $page * $num_to_retrieve;
    $results = tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records);

    // Close the connection.
    yaz_close($yazc);

    return $results;
  }

  /**
   * Builds the CCL search string for an AGL search.
   *
   * The search form allows several rows of the same scope, but the CCL string
   * should have a single entry per scope ('title', 'author', 'abstract',
   * 'journal', 'id' or 'any') and a single entry for the negation of each
   * scope. Rows are therefore collected into one group per scope. The operator
   * of the first row of a group joins the group to the groups before it; the
   * operators of later rows are used inside the group. Groups are emitted in
   * the order in which their first row appeared.
   *
   * @param $search_array
   *   The search criteria, as passed to tripal_pub_remote_search_AGL().
   *
   * @return
   *   The CCL search string, with any leading 'and' or 'or' removed.
   */
  function _tripal_pub_AGL_build_ccl($search_array) {
    $num_criteria = $search_array['num_criteria'];
    $days = array_key_exists('days', $search_array) ? $search_array['days'] : '';

    // Maps each supported scope to its CCL qualifier. The 'any' scope searches
    // without a qualifier.
    $qualifiers = array(
      'title' => 'title',
      'author' => 'author',
      'abstract' => 'abstract',
      'journal' => 'journal',
      'id' => 'id',
      'any' => '',
    );

    $groups = array();
    for ($i = 1; $i <= $num_criteria; $i++) {
      $criterion = $search_array['criteria'][$i];
      $search_terms = trim($criterion['search_terms']);
      $scope = $criterion['scope'];
      $is_phrase = $criterion['is_phrase'];
      $op = $criterion['operation'] ? strtolower($criterion['operation']) : '';

      // Rows with a scope this importer does not know contribute nothing.
      if (!array_key_exists($scope, $qualifiers)) {
        continue;
      }

      // If this is not a phrase then make sure the AND and OR are lower-case.
      if (!$is_phrase) {
        $search_terms = preg_replace('/ OR /', ' or ', $search_terms);
        $search_terms = preg_replace('/ AND /', ' and ', $search_terms);
      }
      // Else make sure the search terms are surrounded by quotes.
      else {
        $search_terms = "\"$search_terms\"";
      }

      // Accessions are entered as 'AGL:<number>' but are searched by number.
      if ($scope == 'id') {
        $search_terms = preg_replace('/AGL:([^\s]*)/', '$1', $search_terms);
      }

      // Rows with a 'not' operation are gathered into a separate negated group
      // for the scope. Inside that group the terms are joined with 'or', so
      // the group reads: not scope=((a) or (b)).
      $negated = ($op == 'not');
      if ($negated) {
        $op = 'or';
      }
      $key = ($negated ? 'negate_' : '') . $scope;

      if (!isset($groups[$key])) {
        // The first row of a group carries no operator inside the group; its
        // operator is kept to join the whole group to the preceding ones.
        $groups[$key] = array(
          'qualifier' => $qualifiers[$scope],
          'negated' => $negated,
          'op' => $op,
          'terms' => "($search_terms) ",
        );
      }
      else {
        $groups[$key]['terms'] .= "$op ($search_terms) ";
      }
    }

    // Now build the CCL string in order.
    $ccl = '';
    foreach ($groups as $group) {
      $joiner = $group['negated'] ? 'not' : $group['op'];
      $qualifier = ($group['qualifier'] === '') ? '' : $group['qualifier'] . '=';
      $ccl .= "$joiner $qualifier(" . $group['terms'] . ") ";
    }

    // For AGL the 'days' form element was converted to represent the year.
    if ($days) {
      $ccl .= "and year=($days)";
    }

    // Remove any preceding 'and' or 'or'.
    return preg_replace('/^\s*(and|or)/', '', $ccl);
  }
  /**
   * Retrieves a range of publications from AGL.
   *
   * @param $yazc
   *   The YAZ connection on which the search was already run.
   * @param $search_str
   *   The search string that was used for searching. It is only echoed back in
   *   the result.
   * @param $start
   *   The zero-based offset of the first record of the range.
   * @param $num_to_retrieve
   *   The number of publications to retrieve.
   * @param $total_records
   *   The total number of records in the dataset. This value should have
   *   been retrieved by the tripal_pub_AGL_count() function.
   *
   * @return
   *   An array containing the 'total_records' in the dataset, the 'search_str'
   *   and 'pubs', an array of the publications that were retrieved. If there
   *   are no records, or retrieval fails, 'total_records' is 0 and 'pubs' is
   *   empty.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records) {
    $no_pubs = array(
      'total_records' => 0,
      'search_str' => $search_str,
      'pubs' => array(),
    );

    // With nothing to fetch (no hits, or a count that already failed and was
    // reported) there is no reason to ask the server for an empty range.
    if ($total_records <= 0) {
      return $no_pubs;
    }

    yaz_range($yazc, 1, $total_records);
    if (!yaz_present($yazc)) {
      $error_no = yaz_errno($yazc);
      $error_msg = yaz_error($yazc);
      $additional = yaz_addinfo($yazc);
      if ($additional != $error_msg) {
        $error_msg .= " $additional";
      }
      drupal_set_message("ERROR waiting on search at AGL: ($error_no) $error_msg", "error");
      watchdog('tpub_import', "ERROR waiting on search at AGL: (%error_no) %error_msg",
        array('%error_no' => $error_no, '%error_msg' => $error_msg), WATCHDOG_ERROR);
      return $no_pubs;
    }

    // Never read past the last record of the result set.
    $end = min($start + $num_to_retrieve, $total_records);

    $pubs = array();
    for ($i = $start; $i < $end; $i++) {
      // Retrieve the XML results. YAZ record positions start at 1.
      $pub_xml = yaz_record($yazc, $i + 1, 'xml; charset=marc-8,utf-8');
      if (!$pub_xml) {
        $error_no = yaz_errno($yazc);
        $error_msg = yaz_error($yazc);
        drupal_set_message("ERROR retrieving records from AGL: ($error_no) $error_msg", "error");
        watchdog('tpub_import', "ERROR retrieving records from AGL: (%error_no) %error_msg",
          array('%error_no' => $error_no, '%error_msg' => $error_msg), WATCHDOG_ERROR);
        return $no_pubs;
      }
      // Parse the pub XML.
      $pubs[] = tripal_pub_AGL_parse_pubxml($pub_xml);
    }

    return array(
      'total_records' => $total_records,
      'search_str' => $search_str,
      'pubs' => $pubs,
    );
  }
  /**
   * Runs the search and reports how many publications match it.
   *
   * @param $yazc
   *   The YAZ connection to search on.
   * @param $search_str
   *   The RPN search string to send.
   *
   * @return
   *   The number of hits reported for the search, or 0 when the search could
   *   not be prepared or waiting on it failed (an error is then shown and
   *   logged).
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_count($yazc, $search_str) {
    // Note: sorting by publication date descending (yaz_sort with "1=31 id")
    // is currently not applied.

    // Work out which step, if any, went wrong. yaz_wait() is only reached when
    // the search itself was prepared successfully.
    $failed_step = NULL;
    if (!yaz_search($yazc, "rpn", $search_str)) {
      $failed_step = 'preparing';
    }
    elseif (!yaz_wait()) {
      $failed_step = 'waiting on';
    }

    // Success: hand back the total number of results from the search.
    if ($failed_step === NULL) {
      return yaz_hits($yazc);
    }

    // Failure: gather the error details, appending the additional info when it
    // says something the error message does not.
    $error_no = yaz_errno($yazc);
    $error_msg = yaz_error($yazc);
    $additional = yaz_addinfo($yazc);
    if ($additional != $error_msg) {
      $error_msg .= " $additional";
    }

    $summary = "ERROR $failed_step search at AGL:";
    drupal_set_message("$summary ($error_no) $error_msg", "error");
    watchdog('tpub_import', "$summary (%error_no) %error_msg",
      array('%error_no' => $error_no, '%error_msg' => $error_msg), WATCHDOG_ERROR);

    return 0;
  }
  503. /**
  504. * Parse publication XML for a single publication
  505. *
  506. * Description of XML format:
  507. * http://www.loc.gov/marc/bibliographic/bdsummary.html
  508. *
  509. * @param $pub_xml
  510. * A string containing the XML for a single publications
  511. *
  512. * @return
  513. * An array containing the details of the publication
  514. *
  515. * @ingroup tripal_pub
  516. */
  517. function tripal_pub_AGL_parse_pubxml($pub_xml) {
  518. $pub = array();
  519. // we will set the default publication type as a journal article. The NAL
  520. // dataset doesn't specify an article type so we'll have to glean the type
  521. // from other information (e.g. series name has 'Proceedings' in it)
  522. $pub['Publication Type'][0] = 'Journal Article';
  523. if (!$pub_xml) {
  524. return $pub;
  525. }
  526. // read the XML and iterate through it.
  527. $xml = new XMLReader();
  528. $xml->xml(trim($pub_xml));
  529. while ($xml->read()) {
  530. $element = $xml->name;
  531. if ($xml->nodeType == XMLReader::ELEMENT and $element == 'controlfield') {
  532. $tag = $xml->getAttribute('tag');
  533. $xml->read();
  534. $value = $xml->value;
  535. switch ($tag) {
  536. case '001': // control number
  537. $pub['Publication Accession'] = $value;
  538. break;
  539. case '003': // control number identifier
  540. break;
  541. case '005': // datea nd time of latest transaction
  542. break;
  543. case '006': // fixed-length data elemetns
  544. break;
  545. case '007': // physical description fixed field
  546. break;
  547. case '008': // fixed length data elements
  548. $month = array(
  549. '01' => 'Jan', '02' => 'Feb', '03' => 'Mar',
  550. '04' => 'Apr', '05' => 'May', '06' => 'Jun',
  551. '07' => 'Jul', '08' => 'Aug', '09' => 'Sep',
  552. '10' => 'Oct', '11' => 'Nov', '12' => 'Dec'
  553. );
  554. $date0 = substr($value, 0, 6); // date entered on file
  555. $date1 = substr($value, 7, 4); // year of publication
  556. $date2 = substr($value, 11, 4); // month of publication
  557. $place = substr($value, 15, 3);
  558. $lang = substr($value, 35, 3);
  559. if (preg_match('/\d\d\d\d/', $date1)) {
  560. $pub['Year'] = $date1;
  561. $pub['Publication Date'] = $date1;
  562. }
  563. if (preg_match('/\d\d/', $date2)) {
  564. $pub['Publication Date'] = $date1 . " " . $month[substr($date2, 0, 2)] . " " . substr($date2, 3, 2);
  565. }
  566. if (!preg_match('/\s+/', $place)) {
  567. $pub['Published Location'] = $place;
  568. }
  569. if (!preg_match('/\s+/', $lang)) {
  570. $pub['Language Abbr'] = $lang;
  571. }
  572. break;
  573. default: // unhandled tag
  574. break;
  575. }
  576. }
  577. elseif ($xml->nodeType == XMLReader::ELEMENT and $element == 'datafield') {
  578. $tag = $xml->getAttribute('tag');
  579. $ind1 = $xml->getAttribute('ind1');
  580. $ind2 = $xml->getAttribute('ind2');
  581. switch ($tag) {
  582. case '16': // National Bibliographic Agency Control Number
  583. break;
  584. case '35': // System Control Number
  585. $author = array();
  586. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  587. foreach ($codes as $code => $value) {
  588. switch ($code) {
  589. case 'a': // System control number
  590. $pub['Publication Accession'] = $value;
  591. break;
  592. }
  593. }
  594. case '40': // Cataloging Source (NR)
  595. $author = array();
  596. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  597. foreach ($codes as $code => $value) {
  598. switch ($code) {
  599. case 'a': // original cataolging agency
  600. $pub['Publication Database'] = $value;
  601. break;
  602. }
  603. }
  604. break;
  605. case '72': // Subject Category Code
  606. break;
  607. case '100': // main entry-personal name
  608. $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
  609. $pub['Author List'][] = $author;
  610. break;
  611. case '110': // main entry-corporate nmae
  612. $author = array();
  613. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  614. foreach ($codes as $code => $value) {
  615. switch ($code) {
  616. case 'a': // Corporate name or jurisdiction name as entry elemen
  617. $author['Collective'] = $value;
  618. break;
  619. case 'b': // Subordinate unit
  620. $author['Collective'] .= ' ' . $value;
  621. break;
  622. }
  623. }
  624. $pub['Author List'][] = $author;
  625. break;
  626. case '111': // main entry-meeting name
  627. break;
  628. case '130': // main entry-uniform title
  629. break;
  630. case '210': // abbreviated title
  631. break;
  632. case '222': // key title
  633. break;
  634. case '240': // uniform title
  635. break;
  636. case '242': // translation of title by cataloging agency
  637. break;
  638. case '243': // collective uniform title
  639. break;
  640. case '245': // title statement
  641. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  642. foreach ($codes as $code => $value) {
  643. switch ($code) {
  644. case 'a':
  645. $pub['Title'] = trim(preg_replace('/\.$/', '', $value));
  646. break;
  647. case 'b':
  648. $pub['Title'] .= ' ' . $value;
  649. break;
  650. case 'h':
  651. $pub['Publication Model'] = $value;
  652. break;
  653. }
  654. }
  655. break;
  656. case '246': // varying form of title
  657. break;
  658. case '247': // former title
  659. break;
  660. case '250': // edition statement
  661. break;
  662. case '254': // musicla presentation statement
  663. break;
  664. case '255': // cartographic mathematical data
  665. break;
  666. case '256': // computer file characteristics
  667. break;
  668. case '257': // country of producing entity
  669. break;
  670. case '258': // philatelic issue data
  671. break;
  672. case '260': // publication, distribution ,etc (imprint)
  673. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  674. foreach ($codes as $code => $value) {
  675. switch ($code) {
  676. case 'a':
  677. $pub['Published Location'] = $value;
  678. break;
  679. case 'b':
  680. $pub['Publisher'] = $value;
  681. break;
  682. case 'c':
  683. $pub['Publication Date'] = $value;
  684. break;
  685. }
  686. }
  687. break;
  688. case '263': // projected publication date
  689. break;
  690. case '264': // production, publication, distribution, manufacture and copyright notice
  691. break;
  692. case '270': // Address
  693. break;
  694. case '300': // Address
  695. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  696. foreach ($codes as $code => $value) {
  697. switch ($code) {
  698. case 'a':
  699. $pages = $value;
  700. $pages = preg_replace('/^p\. /', '', $pages);
  701. $pages = preg_replace('/\.$/', '' , $pages);
  702. if(preg_match('/p$/', $pages)) {
  703. // skip this, it's the number of pages not the page numbers
  704. }
  705. else {
  706. $pub['Pages'] = $pages;
  707. }
  708. break;
  709. }
  710. }
  711. break;
  712. case '500': // series statements
  713. $pub['Notes'] = $value;
  714. break;
  715. case '504': // Bibliography, Etc. Note
  716. break;
  717. case '520': // Summary, etc
  718. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  719. foreach ($codes as $code => $value) {
  720. switch ($code) {
  721. case 'a':
  722. $pub['Abstract'] = $value;
  723. break;
  724. }
  725. }
  726. break;
  727. case '650': // Subject Added Entry-Topical Term
  728. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  729. foreach ($codes as $code => $value) {
  730. switch ($code) {
  731. case 'a':
  732. $pub['Keywords'][] = $value;
  733. break;
  734. }
  735. }
  736. break;
  737. case '653': // Index Term-Uncontrolled
  738. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  739. foreach ($codes as $code => $value) {
  740. switch ($code) {
  741. case 'a':
  742. $pub['Keywords'][] = $value;
  743. break;
  744. }
  745. }
  746. break;
  747. case '700': // Added Entry-Personal Name
  748. $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
  749. $pub['Author List'][] = $author;
  750. break;
  751. case '710': // Added Entry-Corporate Name
  752. $author = array();
  753. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  754. foreach ($codes as $code => $value) {
  755. switch ($code) {
  756. case 'a': // Corporate name or jurisdiction name as entry elemen
  757. $author['Collective'] = $value;
  758. break;
  759. case 'b': // Subordinate unit
  760. $author['Collective'] .= ' ' . $value;
  761. break;
  762. }
  763. }
  764. $pub['Author List'][] = $author;
  765. break;
  766. case '773': // host item entry
  767. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  768. foreach ($codes as $code => $value) {
  769. switch ($code) {
  770. case 'a':
  771. if (preg_match('/Proceedings/i', $value)) {
  772. $pub['Series Name'] = preg_replace('/\.$/', '', $value);
  773. $pub['Publication Type'][0] = 'Conference Proceedings';
  774. }
  775. else {
  776. $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
  777. }
  778. break;
  779. case 't':
  780. if (preg_match('/Proceedings/i', $value)) {
  781. $pub['Series Name'] = preg_replace('/\.$/', '', $value);
  782. $pub['Publication Type'][0] = 'Conference Proceedings';
  783. }
  784. $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
  785. break;
  786. case 'g':
  787. $matches = array();
  788. if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
  789. $pub['Publication Date'] = $matches[1];
  790. }
  791. elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
  792. $year = $matches[4];
  793. $month = $matches[1];
  794. $day = $matches[3];
  795. $pub['Publication Date'] = "$year $month $day";
  796. }
  797. elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
  798. $year = $matches[3];
  799. $month = $matches[1];
  800. $pub['Publication Date'] = "$year $month";
  801. }
  802. elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
  803. $year = $matches[2];
  804. $month = $matches[1];
  805. $pub['Publication Date'] = "$year $month";
  806. }
  807. if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
  808. $pub['Volume'] = $matches[1];
  809. }
  810. if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
  811. $pub['Volume'] = $matches[1];
  812. $pub['Issue'] = $matches[3];
  813. }
  814. if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
  815. $pub['Issue'] = $matches[1];
  816. }
  817. break;
  818. case 'p':
  819. $pub['Journal Abbreviation'] = $value;
  820. break;
  821. case 'z':
  822. $pub['ISBN'] = $value;
  823. break;
  824. }
  825. }
  826. break;
  827. case '852': // Location (Where is the publication held)
  828. break;
  829. case '856': // Electronic Location and Access
  830. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  831. foreach ($codes as $code => $value) {
  832. switch ($code) {
  833. case 'u':
  834. $pub['URL'] = $value;
  835. break;
  836. }
  837. }
  838. break;
  839. default:
  840. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  841. $unhandled[$tag][] = $codes;
  842. break;
  843. }
  844. }
  845. }
  846. //dpm($unhandled);
  847. // build the Dbxref
  848. if ($pub['Publication Database'] != 'AGL') {
  849. }
  850. if ($pub['Publication Accession'] and $pub['Publication Database']) {
  851. $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
  852. unset($pub['Publication Accession']);
  853. unset($pub['Publication Database']);
  854. }
  855. // build the full authors list
  856. if (is_array($pub['Author List'])) {
  857. $authors = '';
  858. foreach ($pub['Author List'] as $author) {
  859. if (array_key_exists('valid', $author) and $author['valid'] == 'N') {
  860. // skip non-valid entries. A non-valid entry should have
  861. // a corresponding corrected entry so we can saftely skip it.
  862. continue;
  863. }
  864. if (array_key_exists('Collective', $author)) {
  865. $authors .= $author['Collective'] . ', ';
  866. }
  867. else {
  868. if (array_key_exists('Surname', $author)) {
  869. $authors .= $author['Surname'];
  870. if(array_key_exists('First Initials', $author)) {
  871. $authors .= ' ' . $author['First Initials'];
  872. }
  873. $authors .= ', ';
  874. }
  875. }
  876. }
  877. $authors = substr($authors, 0, -2);
  878. $pub['Authors'] = $authors;
  879. }
  880. else {
  881. $pub['Authors'] = $pub['Author List'];
  882. }
  883. // for Title, Abstract, Authors, convert the html entity and remove special
  884. // unicode chars that are not meant for display
  885. $pub['Title'] = preg_replace( '/[\p{So}]/u', '', mb_convert_encoding($pub['Title'], 'UTF-8', 'HTML-ENTITIES'));
  886. if (key_exists('Abstract', $pub)) {
  887. $pub['Abstract'] = preg_replace( '/[\p{So}]/u', '', mb_convert_encoding($pub['Abstract'], 'UTF-8', 'HTML-ENTITIES'));
  888. }
  889. $newauths = array();
  890. foreach ($pub['Author List'] AS $auth) {
  891. foreach($auth AS $k => $v) {
  892. $auth[$k] = preg_replace( '/[\p{So}]/u', '', mb_convert_encoding($v, 'UTF-8', 'HTML-ENTITIES'));
  893. }
  894. array_push($newauths, $auth);
  895. }
  896. $pub['Author List'] = $newauths;
  897. // build the citation
  898. $pub['Citation'] = tripal_pub_create_citation($pub);
  899. $pub['raw'] = $pub_xml;
  900. return $pub;
  901. }
  /**
   * Collects the subfields of the datafield currently being read.
   *
   * Advances the reader until the closing </datafield> tag (or the end of the
   * document) and records the text of every <subfield> element it passes,
   * keyed by the element's "code" attribute.
   *
   * @param $xml
   *   The XMLReader object to read from. It is expected to be positioned
   *   within a <datafield> element; on return it sits on the closing
   *   </datafield> tag, or at the end of the document if none was found.
   *
   * @return
   *   An associative array of subfield codes and their values. When the same
   *   code occurs more than once only the last value is kept. Subfields
   *   without content yield an empty string.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL_get_subfield($xml) {
    $codes = array();

    while ($xml->read()) {
      $sub_element = $xml->name;

      // Stop as soon as the end of the enclosing datafield is reached.
      if ($xml->nodeType == XMLReader::END_ELEMENT and $sub_element == 'datafield') {
        break;
      }

      // Only subfield start tags carry a code/value pair.
      if ($xml->nodeType == XMLReader::ELEMENT and $sub_element == 'subfield') {
        $code = $xml->getAttribute('code');

        // A self-closing <subfield/> has no child node. Reading ahead here
        // would step onto whatever follows it (possibly the closing
        // </datafield> tag or the next subfield) and silently consume it, so
        // record an empty value and leave the reader where it is.
        if ($xml->isEmptyElement) {
          $codes[$code] = '';
          continue;
        }

        // Step onto the subfield's first child node and take its value.
        $xml->read();
        $codes[$code] = $xml->value;
      }
    }

    return $codes;
  }
  /**
   * Used for parsing of the XML results to get details about an author.
   *
   * Reads the subfields of the current datafield and interprets subfield 'a'
   * (the author name) according to the first indicator.
   *
   * @param $xml
   *   The XML object to read. It is passed on unchanged to
   *   tripal_pub_remote_search_AGL_get_subfield().
   * @param $ind1
   *   Indicates how an author record is stored; 0 means given name is first,
   *   1 means surname is first, 3 means a family name is given.
   *
   * @return
   *   An associative array describing the author:
   *   - When $ind1 is 0: 'Given Name' holds the name text up to the first
   *     comma.
   *   - When $ind1 is 1: 'Surname' holds the text before the first comma,
   *     'Given Name' the remaining comma-separated parts (each followed by a
   *     space) and 'First Initials' the first character of every
   *     space-separated word of the given name.
   *   - For any other value (including 3) the name is not parsed and the
   *     array is empty.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL_get_author($xml, $ind1) {
    $author = array();

    $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
    foreach ($codes as $code => $value) {
      switch ($code) {
        case 'a':
          // remove any trailing commas
          $value = preg_replace('/,$/', '', $value);

          // Split the parts of the name using a comma. This is done up front
          // so that both indicator branches below can use the parts.
          $names = explode(',', $value);

          if ($ind1 == 0) { // Given Name is first
            $author['Given Name'] = $names[0];
          }

          if ($ind1 == 1) { // Surname is first
            $author['Surname'] = $names[0];
            $author['Given Name'] = '';
            unset($names[0]);
            foreach ($names as $name) {
              $author['Given Name'] .= $name . ' ';
            }

            // Build the initials from the first character of each word. The
            // multi-byte aware function keeps accented first letters intact
            // (XMLReader returns its strings as UTF-8).
            $author['First Initials'] = '';
            foreach (explode(' ', $author['Given Name']) as $name) {
              $author['First Initials'] .= mb_substr($name, 0, 1, 'UTF-8');
            }
          }

          // $ind1 == 3 (a family name) is recognised but intentionally not
          // parsed: no author fields are set for it.
          break;
      }
    }

    return $author;
  }