tripal_chado.pub_importer_AGL.inc 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022
  1. <?php
  2. /**
  3. * @file
  4. * This file provides support for importing and parsing of results from the
  5. * USDA National Agricultural Library (AGL) database. The functions here are
  6. * used by both the publication importer setup form and the publication
  7. * importer. The USDA AGL database uses a YAZ protocol for querying and
  8. * retrieving records.
  9. *
  10. */
  11. /**
  12. * A hook for altering the publication importer form. It Changes the
  13. * 'Days' element to 'Year' and removes the 'Journal Name' filter.
  14. *
  15. * @param $form
  16. * The Drupal form array
  17. * @param $form_state
  18. * The form state array
  19. * @param $num_criteria
  20. * The number of criteria the user currently has added to the form
  21. *
  22. * @return
  23. * The form (drupal form api)
  24. *
  25. * @ingroup tripal_pub
  26. */
  27. function tripal_pub_remote_alter_form_AGL($form, $form_state, $num_criteria = 1) {
  28. // So far we haven't been able to get AGL to filter results to only
  29. // include pubs by the XX number days in the past. So, we will
  30. // change the 'days' element to be the year to query
  31. $form['themed_element']['days']['#title'] = t('Year');
  32. $form['themed_element']['days']['#description'] = t('Please enter a year to limit records by the year they were published, created or modified in the database.');
  33. // The Journal Name filter doesn't seem to work, so remove it
  34. for ($i = 1; $i <= $num_criteria; $i++) {
  35. unset($form['themed_element']['criteria'][$i]["scope-$i"]['#options']['journal']);
  36. }
  37. return $form;
  38. }
  39. /**
  40. * A hook for providing additional validation of importer setup form.
  41. *
  42. * @param $form
  43. * The Drupal form array
  44. * @param $form_state
  45. * The form state array
  46. *
  47. * @return
  48. * The form (drupal form api)
  49. *
  50. * @ingroup tripal_pub
  51. */
  52. function tripal_pub_remote_validate_form_AGL($form, $form_state) {
  53. $days = trim($form_state['values']["days"]);
  54. $num_criteria = $form_state['values']['num_criteria'];
  55. if ($days and !preg_match('/^\d\d\d\d$/', $days)) {
  56. form_set_error("days", "Please enter a four digit year.");
  57. }
  58. $num_ids = 0;
  59. for ($i = 1; $i <= $num_criteria; $i++) {
  60. $search_terms = trim($form_state['values']["search_terms-$i"]);
  61. $scope = $form_state['values']["scope-$i"];
  62. if ($scope == 'id' and !preg_match('/^AGL:\d+$/', $search_terms)) {
  63. form_set_error("search_terms-$i", "The AGL accession be a numeric value, prefixed with 'AGL:' (e.g. AGL:3890740).");
  64. }
  65. if ($scope == 'id') {
  66. $num_ids++;
  67. }
  68. if ($num_ids > 1) {
  69. form_set_error("search_terms-$i", "Unfortuantely, the AGL importer can only support a single accession at a time. Please remove the others.");
  70. }
  71. }
  72. return $form;
  73. }
  74. /**
  75. * A hook for performing the search on the AGL database.
  76. *
  77. * @param $search_array
  78. * An array containing the serach criteria for the serach
  79. * @param $num_to_retrieve
  80. * Indicates the maximum number of publications to retrieve from the remote
  81. * database
  82. * @param $page
  83. * Indicates the page to retrieve. This corresponds to a paged table, where
  84. * each page has $num_to_retrieve publications.
  85. *
  86. * @return
  87. * An array of publications.
  88. *
  89. * @ingroup tripal_pub
  90. */
  91. function tripal_pub_remote_search_AGL($search_array, $num_to_retrieve, $page) {
  92. // get some values from the serach array
  93. $num_criteria = $search_array['num_criteria'];
  94. $days = array_key_exists('days', $search_array) ? $search_array['days'] : '';
  95. // set some defaults
  96. $search_array['limit'] = $num_to_retrieve;
  97. // To build the CCL search string we want to have a single entry for 'author', 'title', 'abstract'
  98. // or 'id', and also the corresponding 'not for each of those.
  99. // But the search form allows the user to have multiple rows of the same type. So, we will build the
  100. // search string separately for each category and it's negative category (if NOT is selected as the op)
  101. // and at the end we will put them together into a single search string. We need to keep
  102. // track of the first entry of any category because it will not have an op (e.g. 'or' or 'and') but the
  103. // operation will be pushed out to separate the categories. The op for any second or third instance of
  104. // the same category will be included within the search string for the catgory.
  105. $ccl = '';
  106. $title = '';
  107. $author = '';
  108. $abstract = '';
  109. $id = '';
  110. $any = '';
  111. $negate_title = '';
  112. $negate_author = '';
  113. $negate_abstract = '';
  114. $negate_id = '';
  115. $negate_any = '';
  116. $order = [];
  117. $first_abstract = 1;
  118. $first_author = 1;
  119. $first_title = 1;
  120. $first_id = 1;
  121. $first_any = 1;
  122. $first_negate_abstract = 1;
  123. $first_negate_author = 1;
  124. $first_negate_title = 1;
  125. $first_negate_id = 1;
  126. $first_negate_any = 1;
  127. for ($i = 1; $i <= $num_criteria; $i++) {
  128. $search_terms = trim($search_array['criteria'][$i]['search_terms']);
  129. $scope = $search_array['criteria'][$i]['scope'];
  130. $is_phrase = $search_array['criteria'][$i]['is_phrase'];
  131. $op = $search_array['criteria'][$i]['operation'];
  132. if ($op) {
  133. $op = strtolower($op);
  134. }
  135. $search_terms = trim($search_terms);
  136. // if this is not a phrase then make sure the AND and OR are lower-case
  137. if (!$is_phrase) {
  138. $search_terms = preg_replace('/ OR /', ' or ', $search_terms);
  139. $search_terms = preg_replace('/ AND /', ' and ', $search_terms);
  140. }
  141. // else make sure the search terms are surrounded by quotes
  142. else {
  143. $search_terms = "\"$search_terms\"";
  144. }
  145. // if this is a 'not' operation then we want to change it to an
  146. // and
  147. $negate = '';
  148. if ($op == 'not') {
  149. $scope = "negate_$scope";
  150. $op = 'or';
  151. }
  152. $order[] = ['scope' => $scope, 'op' => $op];
  153. // build each category
  154. if ($scope == 'title') {
  155. if ($first_title) {
  156. $title .= "($search_terms) ";
  157. $first_title = 0;
  158. }
  159. else {
  160. $title .= "$op ($search_terms) ";
  161. }
  162. }
  163. if ($scope == 'negate_title') {
  164. if ($first_negate_title) {
  165. $negate_title .= "($search_terms) ";
  166. $first_negate_title = 0;
  167. }
  168. else {
  169. $negate_title .= "$op ($search_terms) ";
  170. }
  171. }
  172. elseif ($scope == 'author') {
  173. if ($first_author) {
  174. $author .= "($search_terms) ";
  175. $first_author = 0;
  176. }
  177. else {
  178. $author .= "$op ($search_terms) ";
  179. }
  180. }
  181. elseif ($scope == 'negate_author') {
  182. if ($first_negate_author) {
  183. $negate_author .= "($search_terms) ";
  184. $first_negate_author = 0;
  185. }
  186. else {
  187. $negate_author .= "$op ($search_terms) ";
  188. }
  189. }
  190. elseif ($scope == 'abstract') {
  191. if ($first_abstract) {
  192. $abstract .= "($search_terms) ";
  193. $first_abstract = 0;
  194. }
  195. else {
  196. $abstract .= "$op ($search_terms) ";
  197. }
  198. }
  199. elseif ($scope == 'negate_abstract') {
  200. if ($first_negate_abstract) {
  201. $negate_abstract .= "($search_terms) ";
  202. $first_negate_abstract = 0;
  203. }
  204. else {
  205. $negate_abstract .= "$op ($search_terms) ";
  206. }
  207. }
  208. elseif ($scope == 'journal') {
  209. if ($first_journal) {
  210. $journal .= "($search_terms) ";
  211. $first_jounral = 0;
  212. }
  213. else {
  214. $journal .= "$op ($search_terms) ";
  215. }
  216. }
  217. elseif ($scope == 'negate_journal') {
  218. if ($first_negate_journal) {
  219. $negate_journal .= "($search_terms) ";
  220. $first_negate_journal = 0;
  221. }
  222. else {
  223. $negate_journal .= "$op ($search_terms) ";
  224. }
  225. }
  226. elseif ($scope == 'id') {
  227. if ($first_id) {
  228. $id .= "(" . preg_replace('/AGL:([^\s]*)/', '$1', $search_terms) . ") ";
  229. $first_id = 0;
  230. }
  231. else {
  232. $id .= "$op (" . preg_replace('/AGL:([^\s]*)/', '$1', $search_terms) . ") ";
  233. }
  234. }
  235. elseif ($scope == 'negate_id') {
  236. if ($first_negate_id) {
  237. $negate_id .= "(" . preg_replace('/AGL:([^\s]*)/', '$1', $search_terms) . ") ";
  238. $first_negate_id = 0;
  239. }
  240. else {
  241. $negate_id .= "$op (" . preg_replace('/AGL:([^\s]*)/', '$1', $search_terms) . ") ";
  242. }
  243. }
  244. elseif ($scope == 'any') {
  245. if ($first_any) {
  246. $any .= "($search_terms) ";
  247. $first_any = 0;
  248. }
  249. else {
  250. $any .= "$op ($search_terms) ";
  251. }
  252. }
  253. elseif ($scope == 'negate_any') {
  254. if ($first_negate_any) {
  255. $negate_any .= "($search_terms) ";
  256. $first_any = 0;
  257. }
  258. else {
  259. $negate_any .= "$op ($search_terms) ";
  260. }
  261. }
  262. }
  263. // now build the CCL string in order
  264. $abstract_done = 0;
  265. $author_done = 0;
  266. $journal_done = 0;
  267. $title_done = 0;
  268. $id_done = 0;
  269. $any_done = 0;
  270. $negate_abstract_done = 0;
  271. $negate_journal_done = 0;
  272. $negate_author_done = 0;
  273. $negate_title_done = 0;
  274. $negate_id_done = 0;
  275. $negate_any_done = 0;
  276. for ($i = 0; $i < count($order); $i++) {
  277. if ($order[$i]['scope'] == 'abstract' and !$abstract_done) {
  278. $op = $order[$i]['op'];
  279. $ccl .= "$op abstract=($abstract) ";
  280. $abstract_done = 1;
  281. }
  282. if ($order[$i]['scope'] == 'negate_abstract' and !$negate_abstract_done) {
  283. $ccl .= "not abstract=($negate_abstract) ";
  284. $negate_abstract_done = 1;
  285. }
  286. if ($order[$i]['scope'] == 'author' and !$author_done) {
  287. $op = $order[$i]['op'];
  288. $ccl .= "$op author=($author) ";
  289. $author_done = 1;
  290. }
  291. if ($order[$i]['scope'] == 'negate_author' and !$negate_author_done) {
  292. $ccl .= "not author=($negate_author) ";
  293. $negate_author_done = 1;
  294. }
  295. if ($order[$i]['scope'] == 'journal' and !$journal_done) {
  296. $op = $order[$i]['op'];
  297. $ccl .= "$op journal=($journal) ";
  298. $journal_done = 1;
  299. }
  300. if ($order[$i]['scope'] == 'negate_journal' and !$negate_journal_done) {
  301. $ccl .= "not author=($negate_journal) ";
  302. $negate_journal_done = 1;
  303. }
  304. if ($order[$i]['scope'] == 'id' and !$id_done) {
  305. $op = $order[$i]['op'];
  306. $ccl .= "$op id=($id) ";
  307. $id_done = 1;
  308. }
  309. if ($order[$i]['scope'] == 'negate_id' and !$negate_id_done) {
  310. $ccl .= "not id=($negate_id) ";
  311. $negate_id_done = 1;
  312. }
  313. if ($order[$i]['scope'] == 'title' and !$title_done) {
  314. $op = $order[$i]['op'];
  315. $ccl .= "$op title=($title) ";
  316. $title_done = 1;
  317. }
  318. if ($order[$i]['scope'] == 'negate_title' and !$negate_title_done) {
  319. $ccl .= "not title=($negate_title) ";
  320. $negate_title_done = 1;
  321. }
  322. if ($order[$i]['scope'] == 'any' and !$any_done) {
  323. $op = $order[$i]['op'];
  324. $ccl .= "$op ($any) ";
  325. $any_done = 1;
  326. }
  327. if ($order[$i]['scope'] == 'negate_any' and !$negate_any_done) {
  328. $ccl .= "not ($negate_any) ";
  329. $negate_any_done = 1;
  330. }
  331. }
  332. // for AGL the 'days' form element was converted to represent the year
  333. if ($days) {
  334. $ccl .= "and year=($days)";
  335. }
  336. // remove any preceeding 'and' or 'or'
  337. $ccl = preg_replace('/^\s*(and|or)/', '', $ccl);
  338. // yaz_connect() prepares for a connection to a Z39.50 server. This function is non-blocking
  339. // and does not attempt to establish a connection - it merely prepares a connect to be
  340. // performed later when yaz_wait() is called.
  341. // NAL Catalog
  342. //$yazc = yaz_connect('agricola.nal.usda.gov:7090/voyager');
  343. // NAL Article Citation Database
  344. $yazc = yaz_connect('agricola.nal.usda.gov:7190/voyager');
  345. // use the USMARC record type. But OPAC is also supported by Agricola
  346. yaz_syntax($yazc, "usmarc");
  347. // the search query is built using CCL, we need to first
  348. // configure it so it can map the attributes to defined identifiers
  349. // The attribute set used by AGL can be found at the bottom of this page:
  350. // http://agricola.nal.usda.gov/help/z3950.html
  351. //
  352. // More in depth details: http://www.loc.gov/z3950/agency/bib1.html
  353. //
  354. // CCL Syntax: http://www.indexdata.com/yaz/doc/tools.html#CCL
  355. //
  356. $fields = [
  357. "title" => "u=4",
  358. "author" => "u=1003",
  359. "abstract" => "u=62",
  360. "id" => "u=12",
  361. "year" => "u=30 r=o",
  362. "journal" => "u=1033",
  363. ];
  364. yaz_ccl_conf($yazc, $fields);
  365. if (!yaz_ccl_parse($yazc, $ccl, $cclresult)) {
  366. drupal_set_message('Error parsing search string: ' . $cclresult["errorstring"], "error");
  367. watchdog('tpub_import', 'Error: %errstr', ['%errstr' => $cclresult["errorstring"]], WATCHDOG_ERROR);
  368. return [
  369. 'total_records' => 0,
  370. 'search_str' => '',
  371. 'pubs' => [],
  372. ];
  373. }
  374. $search_str = $cclresult["rpn"];
  375. // get the total number of records
  376. $total_records = tripal_pub_AGL_count($yazc, $search_str);
  377. // get the pubs in the specified rang
  378. $start = $page * $num_to_retrieve;
  379. $results = tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records);
  380. // close the connection
  381. yaz_close($yazc);
  382. return $results;
  383. }
  384. /**
  385. * Retrieves a range of publications from AGL
  386. *
  387. * @param $yazc
  388. * The YAZC connection object.
  389. * @param $search_str
  390. * The search string to use for searching.
  391. * @param $start
  392. * The start of the range
  393. * @param $num_to_retrieve
  394. * The number of publications to retrieve
  395. * @param $total_records
  396. * The total number of records in the dataset. This value should have
  397. * been retrieved by tripal_pub_AGL_count() function.
  398. *
  399. * @return
  400. * An array containing the total_records in the dataaset, the search string
  401. * and an array of the publications that were retreived.
  402. *
  403. * @ingroup tripal_pub
  404. */
  405. function tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records) {
  406. yaz_range($yazc, 1, $total_records);
  407. if (!yaz_present($yazc)) {
  408. $error_no = yaz_errno($yazc);
  409. $error_msg = yaz_error($yazc);
  410. $additional = yaz_addinfo($yazc);
  411. if ($additional != $error_msg) {
  412. $error_msg .= " $additional";
  413. }
  414. drupal_set_message("ERROR waiting on search at AGL: ($error_no) $error_msg", "error");
  415. watchdog('tpub_import', "ERROR waiting on search at AGL: (%error_no) %error_msg",
  416. ['%error_no' => $error_no, '%error_msg' => $error_msg], WATCHDOG_ERROR);
  417. return [
  418. 'total_records' => 0,
  419. 'search_str' => $search_str,
  420. 'pubs' => [],
  421. ];
  422. }
  423. if ($start + $num_to_retrieve > $total_records) {
  424. $num_to_retrieve = $total_records - $start;
  425. }
  426. $pubs = [];
  427. for ($i = $start; $i < $start + $num_to_retrieve; $i++) {
  428. // retrieve the XML results
  429. $pub_xml = yaz_record($yazc, $i + 1, 'xml; charset=marc-8,utf-8');
  430. if (!$pub_xml) {
  431. $error_no = yaz_errno($yazc);
  432. $error_msg = yaz_error($yazc);
  433. drupal_set_message("ERROR retrieving records from AGL: ($error_no) $error_msg", "error");
  434. watchdog('tpub_import', "ERROR retrieving records from AGL: (%error_no) %error_msg",
  435. ['%error_no' => $error_no, '%error_msg' => $error_msg], WATCHDOG_ERROR);
  436. return [
  437. 'total_records' => 0,
  438. 'search_str' => $search_str,
  439. 'pubs' => [],
  440. ];
  441. }
  442. // parse the pub XML
  443. $pub = tripal_pub_AGL_parse_pubxml($pub_xml);
  444. $pubs[] = $pub;
  445. }
  446. return [
  447. 'total_records' => $total_records,
  448. 'search_str' => $search_str,
  449. 'pubs' => $pubs,
  450. ];
  451. }
  452. /**
  453. * Retrieves the total number of publications that match the search string.
  454. *
  455. * @param $yazc
  456. * The YAZC connection object.
  457. * @param $search_str
  458. * The search string to use for searching.
  459. *
  460. * @return
  461. * a count of the total number of publications that match the search string
  462. *
  463. * @ingroup tripal_pub
  464. */
  465. function tripal_pub_AGL_count($yazc, $search_str) {
  466. //yaz_sort($yazc, "1=31 id"); // sort by publication date descending
  467. if (!yaz_search($yazc, "rpn", $search_str)) {
  468. $error_no = yaz_errno($yazc);
  469. $error_msg = yaz_error($yazc);
  470. $additional = yaz_addinfo($yazc);
  471. if ($additional != $error_msg) {
  472. $error_msg .= " $additional";
  473. }
  474. drupal_set_message("ERROR preparing search at AGL: ($error_no) $error_msg", "error");
  475. watchdog('tpub_import', "ERROR preparing search at AGL: (%error_no) %error_msg",
  476. ['%error_no' => $error_no, '%error_msg' => $error_msg], WATCHDOG_ERROR);
  477. return 0;
  478. }
  479. if (!yaz_wait()) {
  480. $error_no = yaz_errno($yazc);
  481. $error_msg = yaz_error($yazc);
  482. $additional = yaz_addinfo($yazc);
  483. if ($additional != $error_msg) {
  484. $error_msg .= " $additional";
  485. }
  486. drupal_set_message("ERROR waiting on search at AGL: ($error_no) $error_msg", "error");
  487. watchdog('tpub_import', "ERROR waiting on search at AGL: (%error_no) %error_msg",
  488. ['%error_no' => $error_no, '%error_msg' => $error_msg], WATCHDOG_ERROR);
  489. return 0;
  490. }
  491. // get the total number of results from the serach
  492. $count = yaz_hits($yazc);
  493. return $count;
  494. }
  495. /**
  496. * Parse publication XML for a single publication
  497. *
  498. * Description of XML format:
  499. * http://www.loc.gov/marc/bibliographic/bdsummary.html
  500. *
  501. * @param $pub_xml
  502. * A string containing the XML for a single publications
  503. *
  504. * @return
  505. * An array containing the details of the publication
  506. *
  507. * @ingroup tripal_pub
  508. */
  509. function tripal_pub_AGL_parse_pubxml($pub_xml) {
  510. $pub = [];
  511. // we will set the default publication type as a journal article. The NAL
  512. // dataset doesn't specify an article type so we'll have to glean the type
  513. // from other information (e.g. series name has 'Proceedings' in it)
  514. $pub['Publication Type'][0] = 'Journal Article';
  515. if (!$pub_xml) {
  516. return $pub;
  517. }
  518. // read the XML and iterate through it.
  519. $xml = new XMLReader();
  520. $xml->xml(trim($pub_xml));
  521. while ($xml->read()) {
  522. $element = $xml->name;
  523. if ($xml->nodeType == XMLReader::ELEMENT and $element == 'controlfield') {
  524. $tag = $xml->getAttribute('tag');
  525. $xml->read();
  526. $value = $xml->value;
  527. switch ($tag) {
  528. case '001': // control number
  529. $pub['Publication Accession'] = $value;
  530. break;
  531. case '003': // control number identifier
  532. break;
  533. case '005': // datea nd time of latest transaction
  534. break;
  535. case '006': // fixed-length data elemetns
  536. break;
  537. case '007': // physical description fixed field
  538. break;
  539. case '008': // fixed length data elements
  540. $month = [
  541. '01' => 'Jan',
  542. '02' => 'Feb',
  543. '03' => 'Mar',
  544. '04' => 'Apr',
  545. '05' => 'May',
  546. '06' => 'Jun',
  547. '07' => 'Jul',
  548. '08' => 'Aug',
  549. '09' => 'Sep',
  550. '10' => 'Oct',
  551. '11' => 'Nov',
  552. '12' => 'Dec',
  553. ];
  554. $date0 = substr($value, 0, 6); // date entered on file
  555. $date1 = substr($value, 7, 4); // year of publication
  556. $date2 = substr($value, 11, 4); // month of publication
  557. $place = substr($value, 15, 3);
  558. $lang = substr($value, 35, 3);
  559. if (preg_match('/\d\d\d\d/', $date1)) {
  560. $pub['Year'] = $date1;
  561. $pub['Publication Date'] = $date1;
  562. }
  563. if (preg_match('/\d\d/', $date2)) {
  564. $pub['Publication Date'] = $date1 . " " . $month[substr($date2, 0, 2)] . " " . substr($date2, 3, 2);
  565. }
  566. if (!preg_match('/\s+/', $place)) {
  567. $pub['Published Location'] = $place;
  568. }
  569. if (!preg_match('/\s+/', $lang)) {
  570. $pub['Language Abbr'] = $lang;
  571. }
  572. break;
  573. default: // unhandled tag
  574. break;
  575. }
  576. }
  577. elseif ($xml->nodeType == XMLReader::ELEMENT and $element == 'datafield') {
  578. $tag = $xml->getAttribute('tag');
  579. $ind1 = $xml->getAttribute('ind1');
  580. $ind2 = $xml->getAttribute('ind2');
  581. switch ($tag) {
  582. case '16': // National Bibliographic Agency Control Number
  583. break;
  584. case '35': // System Control Number
  585. $author = [];
  586. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  587. foreach ($codes as $code => $value) {
  588. switch ($code) {
  589. case 'a': // System control number
  590. $pub['Publication Accession'] = $value;
  591. break;
  592. }
  593. }
  594. case '40': // Cataloging Source (NR)
  595. $author = [];
  596. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  597. foreach ($codes as $code => $value) {
  598. switch ($code) {
  599. case 'a': // original cataolging agency
  600. $pub['Publication Database'] = $value;
  601. break;
  602. }
  603. }
  604. break;
  605. case '72': // Subject Category Code
  606. break;
  607. case '100': // main entry-personal name
  608. $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
  609. $pub['Author List'][] = $author;
  610. break;
  611. case '110': // main entry-corporate nmae
  612. $author = [];
  613. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  614. foreach ($codes as $code => $value) {
  615. switch ($code) {
  616. case 'a': // Corporate name or jurisdiction name as entry elemen
  617. $author['Collective'] = $value;
  618. break;
  619. case 'b': // Subordinate unit
  620. $author['Collective'] .= ' ' . $value;
  621. break;
  622. }
  623. }
  624. $pub['Author List'][] = $author;
  625. break;
  626. case '111': // main entry-meeting name
  627. break;
  628. case '130': // main entry-uniform title
  629. break;
  630. case '210': // abbreviated title
  631. break;
  632. case '222': // key title
  633. break;
  634. case '240': // uniform title
  635. break;
  636. case '242': // translation of title by cataloging agency
  637. break;
  638. case '243': // collective uniform title
  639. break;
  640. case '245': // title statement
  641. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  642. foreach ($codes as $code => $value) {
  643. switch ($code) {
  644. case 'a':
  645. $pub['Title'] = trim(preg_replace('/\.$/', '', $value));
  646. break;
  647. case 'b':
  648. $pub['Title'] .= ' ' . $value;
  649. break;
  650. case 'h':
  651. $pub['Publication Model'] = $value;
  652. break;
  653. }
  654. }
  655. break;
  656. case '246': // varying form of title
  657. break;
  658. case '247': // former title
  659. break;
  660. case '250': // edition statement
  661. break;
  662. case '254': // musicla presentation statement
  663. break;
  664. case '255': // cartographic mathematical data
  665. break;
  666. case '256': // computer file characteristics
  667. break;
  668. case '257': // country of producing entity
  669. break;
  670. case '258': // philatelic issue data
  671. break;
  672. case '260': // publication, distribution ,etc (imprint)
  673. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  674. foreach ($codes as $code => $value) {
  675. switch ($code) {
  676. case 'a':
  677. $pub['Published Location'] = $value;
  678. break;
  679. case 'b':
  680. $pub['Publisher'] = $value;
  681. break;
  682. case 'c':
  683. $pub['Publication Date'] = $value;
  684. break;
  685. }
  686. }
  687. break;
  688. case '263': // projected publication date
  689. break;
  690. case '264': // production, publication, distribution, manufacture and copyright notice
  691. break;
  692. case '270': // Address
  693. break;
  694. case '300': // Address
  695. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  696. foreach ($codes as $code => $value) {
  697. switch ($code) {
  698. case 'a':
  699. $pages = $value;
  700. $pages = preg_replace('/^p\. /', '', $pages);
  701. $pages = preg_replace('/\.$/', '', $pages);
  702. if (preg_match('/p$/', $pages)) {
  703. // skip this, it's the number of pages not the page numbers
  704. }
  705. else {
  706. $pub['Pages'] = $pages;
  707. }
  708. break;
  709. }
  710. }
  711. break;
  712. case '500': // series statements
  713. $pub['Notes'] = $value;
  714. break;
  715. case '504': // Bibliography, Etc. Note
  716. break;
  717. case '520': // Summary, etc
  718. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  719. foreach ($codes as $code => $value) {
  720. switch ($code) {
  721. case 'a':
  722. $pub['Abstract'] = $value;
  723. break;
  724. }
  725. }
  726. break;
  727. case '650': // Subject Added Entry-Topical Term
  728. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  729. foreach ($codes as $code => $value) {
  730. switch ($code) {
  731. case 'a':
  732. $pub['Keywords'][] = $value;
  733. break;
  734. }
  735. }
  736. break;
  737. case '653': // Index Term-Uncontrolled
  738. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  739. foreach ($codes as $code => $value) {
  740. switch ($code) {
  741. case 'a':
  742. $pub['Keywords'][] = $value;
  743. break;
  744. }
  745. }
  746. break;
  747. case '700': // Added Entry-Personal Name
  748. $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
  749. $pub['Author List'][] = $author;
  750. break;
  751. case '710': // Added Entry-Corporate Name
  752. $author = [];
  753. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  754. foreach ($codes as $code => $value) {
  755. switch ($code) {
  756. case 'a': // Corporate name or jurisdiction name as entry elemen
  757. $author['Collective'] = $value;
  758. break;
  759. case 'b': // Subordinate unit
  760. $author['Collective'] .= ' ' . $value;
  761. break;
  762. }
  763. }
  764. $pub['Author List'][] = $author;
  765. break;
  766. case '773': // host item entry
  767. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  768. foreach ($codes as $code => $value) {
  769. switch ($code) {
  770. case 'a':
  771. if (preg_match('/Proceedings/i', $value)) {
  772. $pub['Series Name'] = preg_replace('/\.$/', '', $value);
  773. $pub['Publication Type'][0] = 'Conference Proceedings';
  774. }
  775. else {
  776. $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
  777. }
  778. break;
  779. case 't':
  780. if (preg_match('/Proceedings/i', $value)) {
  781. $pub['Series Name'] = preg_replace('/\.$/', '', $value);
  782. $pub['Publication Type'][0] = 'Conference Proceedings';
  783. }
  784. $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
  785. break;
  786. case 'g':
  787. $matches = [];
  788. if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
  789. $pub['Publication Date'] = $matches[1];
  790. }
  791. elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
  792. $year = $matches[4];
  793. $month = $matches[1];
  794. $day = $matches[3];
  795. $pub['Publication Date'] = "$year $month $day";
  796. }
  797. elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
  798. $year = $matches[3];
  799. $month = $matches[1];
  800. $pub['Publication Date'] = "$year $month";
  801. }
  802. elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
  803. $year = $matches[2];
  804. $month = $matches[1];
  805. $pub['Publication Date'] = "$year $month";
  806. }
  807. if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
  808. $pub['Volume'] = $matches[1];
  809. }
  810. if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
  811. $pub['Volume'] = $matches[1];
  812. $pub['Issue'] = $matches[3];
  813. }
  814. if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
  815. $pub['Issue'] = $matches[1];
  816. }
  817. break;
  818. case 'p':
  819. $pub['Journal Abbreviation'] = $value;
  820. break;
  821. case 'z':
  822. $pub['ISBN'] = $value;
  823. break;
  824. }
  825. }
  826. break;
  827. case '852': // Location (Where is the publication held)
  828. break;
  829. case '856': // Electronic Location and Access
  830. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  831. foreach ($codes as $code => $value) {
  832. switch ($code) {
  833. case 'u':
  834. $pub['URL'] = $value;
  835. break;
  836. }
  837. }
  838. break;
  839. default:
  840. $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
  841. $unhandled[$tag][] = $codes;
  842. break;
  843. }
  844. }
  845. }
  846. // build the Dbxref
  847. if ($pub['Publication Database'] != 'AGL') {
  848. }
  849. if ($pub['Publication Accession'] and $pub['Publication Database']) {
  850. $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
  851. unset($pub['Publication Accession']);
  852. unset($pub['Publication Database']);
  853. }
  854. // build the full authors list
  855. if (is_array($pub['Author List'])) {
  856. $authors = '';
  857. foreach ($pub['Author List'] as $author) {
  858. if (array_key_exists('valid', $author) and $author['valid'] == 'N') {
  859. // skip non-valid entries. A non-valid entry should have
  860. // a corresponding corrected entry so we can saftely skip it.
  861. continue;
  862. }
  863. if (array_key_exists('Collective', $author)) {
  864. $authors .= $author['Collective'] . ', ';
  865. }
  866. else {
  867. if (array_key_exists('Surname', $author)) {
  868. $authors .= $author['Surname'];
  869. if (array_key_exists('First Initials', $author)) {
  870. $authors .= ' ' . $author['First Initials'];
  871. }
  872. $authors .= ', ';
  873. }
  874. }
  875. }
  876. $authors = substr($authors, 0, -2);
  877. $pub['Authors'] = $authors;
  878. }
  879. else {
  880. $pub['Authors'] = $pub['Author List'];
  881. }
  882. // for Title, Abstract, Authors, convert the html entity and remove special unicode chars that are not meant for display
  883. $pub['Title'] = preg_replace('/[\p{So}]/u', '', mb_convert_encoding($pub['Title'], 'UTF-8', 'HTML-ENTITIES'));
  884. if (key_exists('Abstract', $pub)) {
  885. $pub['Abstract'] = preg_replace('/[\p{So}]/u', '', mb_convert_encoding($pub['Abstract'], 'UTF-8', 'HTML-ENTITIES'));
  886. }
  887. $newauths = [];
  888. foreach ($pub['Author List'] AS $auth) {
  889. foreach ($auth AS $k => $v) {
  890. $auth[$k] = preg_replace('/[\p{So}]/u', '', mb_convert_encoding($v, 'UTF-8', 'HTML-ENTITIES'));
  891. }
  892. array_push($newauths, $auth);
  893. }
  894. $pub['Author List'] = $newauths;
  895. // build the citation
  896. $pub['Citation'] = chado_pub_create_citation($pub);
  897. $pub['raw'] = $pub_xml;
  898. return $pub;
  899. }
  /**
   * Used for parsing of the XML results to get a set of subfields
   *
   * Reads forward from the reader's current position until the closing
   * </datafield> tag (or the end of the document) and records the text of every
   * <subfield> element met on the way, keyed by its "code" attribute. When a
   * code occurs more than once the last value wins.
   *
   * @param $xml
   *   The XMLReader object to read. It is advanced as a side effect; callers
   *   presumably invoke this while positioned on an opening <datafield>
   *   element (confirm against the calling code).
   *
   * @return
   *   An array of subfield values keyed by subfield code. A subfield without
   *   text content yields an empty string.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL_get_subfield($xml) {
    $codes = [];
    while ($xml->read()) {
      $sub_element = $xml->name;
      // when we've reached the end of the datafield element then stop reading
      if ($xml->nodeType == XMLReader::END_ELEMENT && $sub_element == 'datafield') {
        return $codes;
      }
      // if inside the subfield element then get the code and its value
      if ($xml->nodeType == XMLReader::ELEMENT && $sub_element == 'subfield') {
        // Must be checked while still on the element node itself.
        $is_empty = $xml->isEmptyElement;
        $code = $xml->getAttribute('code');
        if ($is_empty) {
          // A self-closing <subfield/> has neither a text node nor an end tag.
          // Reading ahead would consume whatever follows it: the next subfield
          // (which would then be lost) or the closing </datafield> (which
          // would make this loop run on into the following datafields).
          $codes[$code] = '';
          continue;
        }
        // Step onto the node that follows the start tag and take its value.
        $xml->read();
        $codes[$code] = $xml->value;
      }
    }
    return $codes;
  }
  /**
   * Used for parsing of the XML results to get details about an author
   *
   * Only subfield 'a' (the name) of the datafield is used; every other
   * subfield is ignored.
   *
   * @param $xml
   *   The XML object to read. It is handed to
   *   tripal_pub_remote_search_AGL_get_subfield(), which advances it.
   * @param $ind1
   *   Indicates how an author record is stored; 0 means given name is first
   *   1 means surname is first, 3 means a family name is given
   *
   * @return
   *   An array describing the author. It is empty when there is no 'a'
   *   subfield or when $ind1 is 3 (family names are not parsed). Otherwise:
   *   - $ind1 == 0: 'Given Name' holds the name with a trailing comma removed.
   *   - $ind1 == 1: 'Surname' holds the text before the first comma,
   *     'Given Name' holds the remaining comma-separated parts each followed by
   *     a space, and 'First Initials' holds the first character of every
   *     space-separated word of 'Given Name'.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL_get_author($xml, $ind1) {
    $author = [];
    $codes = tripal_pub_remote_search_AGL_get_subfield($xml);

    // Look the name subfield up by key instead of switching over every code:
    // PHP turns a numeric code such as "0" into an integer array key, and
    // before PHP 8 the loose comparison 0 == 'a' is TRUE, so such a subfield
    // would have been parsed as if it were the name.
    if (!array_key_exists('a', $codes)) {
      return $author;
    }

    // remove any trailing commas
    $value = preg_replace('/,$/', '', $codes['a']);

    if ($ind1 == 0) { // Given Name is first
      // The whole value is the name; there is nothing to split.
      $author['Given Name'] = $value;
    }
    if ($ind1 == 1) { // Surname is first
      // split the parts of the name using a comma
      $names = explode(',', $value);
      $author['Surname'] = array_shift($names);
      $author['Given Name'] = '';
      foreach ($names as $name) {
        $author['Given Name'] .= $name . ' ';
      }
      $author['First Initials'] = '';
      foreach (explode(' ', $author['Given Name']) as $name) {
        // Take the first character, not the first byte, so a name that starts
        // with a multibyte UTF-8 character does not produce a broken initial.
        $author['First Initials'] .= mb_substr($name, 0, 1, 'UTF-8');
      }
    }
    // $ind1 == 3 (a family name): nothing is extracted.

    return $author;
  }