AGL.inc 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. <?php
  2. /**
  3. * Installation:
  4. * 1) Install the yaz libraries: sudo apt-get install yaz libyaz3 libyaz3-dev
  5. * 2) Install the PHP module: sudo pecl install yaz
  6. * 3) Add "extension=yaz.so" to php.ini
  7. * 4) Restart apache
  8. *
  9. *
  10. */
  /**
   * Searches the NAL Article Citation Database (AGL) over Z39.50 using YAZ.
   *
   * The search criteria are turned into a CCL query, the CCL is converted to
   * RPN by yaz_ccl_parse(), and the paging is delegated to
   * tripal_pager_callback(), which is given the names of the
   * tripal_pub_AGL_range() and tripal_pub_AGL_count() callbacks. The YAZ
   * connection is shared with those callbacks through
   * $_SESSION['tripal_pub_AGL_query'] and removed again before returning.
   *
   * @param $search_array
   *   An associative array read for the keys 'num_criteria', 'days' and
   *   'criteria'. Each 'criteria' entry (indexed from 1) is read for
   *   'search_terms', 'scope', 'is_phrase' and 'operation'.
   * @param $num_to_retrieve
   *   The number of records per page. It is stored in the search array as
   *   'limit' and passed to tripal_pager_callback().
   * @param $pager_id
   *   The pager identifier passed through to tripal_pager_callback().
   *
   * @return
   *   The value returned by tripal_pager_callback(), or an empty array when the
   *   connection cannot be prepared or the query cannot be parsed.
   */
  function tripal_pub_remote_search_AGL($search_array, $num_to_retrieve, $pager_id) {
    // Get some values from the search array, tolerating missing keys.
    $num_criteria = isset($search_array['num_criteria']) ? (int) $search_array['num_criteria'] : 0;
    $days = isset($search_array['days']) ? $search_array['days'] : 0;

    // The pager callbacks read the page size from the search array.
    $search_array['limit'] = $num_to_retrieve;

    // Build the CCL query by iterating through the search criteria.
    $ccl = '';
    for ($i = 1; $i <= $num_criteria; $i++) {
      $criterion = array();
      if (isset($search_array['criteria'][$i]) and is_array($search_array['criteria'][$i])) {
        $criterion = $search_array['criteria'][$i];
      }
      $criterion += array(
        'search_terms' => '',
        'scope' => '',
        'is_phrase' => FALSE,
        'operation' => '',
      );

      // The boolean operator joins this criterion to the previous one.
      if ($criterion['operation']) {
        $ccl .= " " . strtolower($criterion['operation']) . " ";
      }

      $search_terms = trim($criterion['search_terms']);
      if ($criterion['is_phrase']) {
        // A phrase is searched verbatim, so surround it with quotes.
        $search_terms = "\"$search_terms\"";
      }
      else {
        // Otherwise make sure the OR and AND operators are lower-case.
        $search_terms = str_replace(array(' OR ', ' AND '), array(' or ', ' and '), $search_terms);
      }

      switch ($criterion['scope']) {
        case 'title':
          $clause = "title=($search_terms)";
          break;

        case 'author':
          $clause = "author=($search_terms)";
          break;

        case 'abstit':
          $clause = "(title=($search_terms) or abstract=($search_terms))";
          break;

        case 'id':
          // Accessions are entered as "AGL:<id>"; rewrite them as id searches.
          $clause = preg_replace('/AGL:([^\s]*)/', 'id=($1)', $search_terms);
          break;

        default:
          $clause = "$search_terms";
          break;
      }
      $ccl .= ' (' . $clause . ') ';
    }

    // Optionally restrict the results to records dated within the last $days days.
    if (is_numeric($days) and $days > 0) {
      $since = date('Ymd', (int) (time() - ($days * 86400)));
      $date_clause = "(date>=" . $since . ")";
      // Only join with "and" when there is something to join to; a query that
      // starts with an operator cannot be parsed.
      $ccl .= (trim($ccl) === '') ? $date_clause : " and " . $date_clause;
    }

    // yaz_connect() prepares for a connection to a Z39.50 server. This function
    // is non-blocking and does not attempt to establish a connection - it merely
    // prepares a connection to be performed later when yaz_wait() is called.
    // The NAL Catalog is served at 'agricola.nal.usda.gov:7090/voyager'.
    $yazc = yaz_connect('agricola.nal.usda.gov:7190/voyager'); // NAL Article Citation Database
    if (!$yazc) {
      drupal_set_message('Error preparing the connection to AGL.', "error");
      watchdog('tripal_pub', 'Error: %errstr', array('%errstr' => 'yaz_connect() failed for AGL'), WATCHDOG_ERROR);
      return array();
    }

    // Use the USMARC record type. OPAC is also supported by Agricola.
    yaz_syntax($yazc, "usmarc");

    // The search query is built using CCL, so the CCL field names must first be
    // mapped to attributes. The attribute set used by AGL can be found at the
    // bottom of this page:
    // http://agricola.nal.usda.gov/help/z3950.html
    //
    // More in depth details: http://www.loc.gov/z3950/agency/bib1.html
    //
    // CCL Syntax: http://www.indexdata.com/yaz/doc/tools.html#CCL
    $fields = array(
      "title" => "u=4",
      "author" => "u=1003",
      "abstract" => "u=62",
      "id" => "u=12",
      "date" => "u=1012 r=o p=3 s=100 ",
    );
    yaz_ccl_conf($yazc, $fields);

    // The result argument is declared by-reference by yaz_ccl_parse() itself;
    // marking it with & at the call site is a fatal error since PHP 5.4.
    $cclresult = array();
    if (!yaz_ccl_parse($yazc, $ccl, $cclresult)) {
      $errstr = isset($cclresult["errorstring"]) ? $cclresult["errorstring"] : '';
      drupal_set_message('Error parsing search string: ' . $errstr, "error");
      watchdog('tripal_pub', 'Error: %errstr', array('%errstr' => $errstr), WATCHDOG_ERROR);
      // Do not leak the connection on the error path.
      yaz_close($yazc);
      return array();
    }
    $search_str = $cclresult["rpn"];
    $search_array['search_string'] = $search_str;

    // Save the YAZ connection in the session for use by the pager callbacks.
    $_SESSION['tripal_pub_AGL_query'][$search_str]['yaz_connection'] = $yazc;

    // Get the list of pubs using the search terms but using a Drupal style pager.
    $pubs = tripal_pager_callback('tripal_pub_AGL_range', $num_to_retrieve, $pager_id,
      'tripal_pub_AGL_count', $search_array);

    // Close the connection.
    unset($_SESSION['tripal_pub_AGL_query'][$search_str]['yaz_connection']);
    yaz_close($yazc);

    return $pubs;
  }
  /**
   * Pager callback that returns the size of the dataset to be paged.
   *
   * This function is used as the count callback with tripal_pager_callback().
   * It runs the RPN query on the YAZ connection stored in
   * $_SESSION['tripal_pub_AGL_query'] and stores the number of hits in the same
   * session entry under the key 'Count'.
   *
   * @param $search_array
   *   The search array; only the key 'search_string' (the RPN query) is read.
   *
   * @return
   *   The number of hits reported by yaz_hits(), or 0 when the search fails.
   */
  function tripal_pub_AGL_count($search_array) {
    $search_str = $search_array['search_string'];

    if (empty($_SESSION['tripal_pub_AGL_query'][$search_str]['yaz_connection'])) {
      drupal_set_message("ERROR preparing search at AGL: no open connection", "error");
      return 0;
    }
    $yazc = $_SESSION['tripal_pub_AGL_query'][$search_str]['yaz_connection'];

    //yaz_sort($yazc, "1=31 id"); // sort by publication date descending

    // Work out which step failed, if any. The search is only carried out by
    // yaz_wait(), and per the YAZ documentation failures are reported through
    // yaz_error() for the connection, so the error string is checked as well as
    // the return values.
    $failed_step = '';
    if (!yaz_search($yazc, "rpn", $search_str)) {
      $failed_step = 'preparing';
    }
    elseif (!yaz_wait() or yaz_error($yazc) != '') {
      $failed_step = 'waiting on';
    }

    if ($failed_step) {
      $error_no = yaz_errno($yazc);
      $error_msg = yaz_error($yazc);
      $additional = yaz_addinfo($yazc);
      // Append the additional information only when it adds something.
      if ($additional != '' and $additional != $error_msg) {
        $error_msg .= " $additional";
      }
      drupal_set_message("ERROR $failed_step search at AGL: ($error_no) $error_msg", "error");
      return 0;
    }

    // Get the total number of results from the search.
    $count = yaz_hits($yazc);
    $_SESSION['tripal_pub_AGL_query'][$search_str]['Count'] = $count;

    return $count;
  }
  /**
   * Pager callback that returns the publications within the specified range.
   *
   * This function is used as the range callback with tripal_pager_callback().
   * It reads the YAZ connection and the hit count that tripal_pub_AGL_count()
   * stored in $_SESSION['tripal_pub_AGL_query'], retrieves the requested
   * records and parses each one with tripal_pub_AGL_parse_pubxml().
   *
   * @param $search_array
   *   The search array; the keys 'search_string' and 'limit' are read.
   * @param $start
   *   Zero-based offset of the first record to return.
   * @param $limit
   *   Maximum number of records to return. It is overridden by
   *   $search_array['limit'] when that key is set.
   *
   * @return
   *   An array of publication arrays; empty when there is nothing in the
   *   requested range or the records cannot be retrieved.
   */
  function tripal_pub_AGL_range($search_array, $start = 0, $limit = 10) {
    $pubs = array();

    $search_str = $search_array['search_string'];
    if (isset($search_array['limit'])) {
      $limit = $search_array['limit'];
    }

    $query = array();
    if (isset($_SESSION['tripal_pub_AGL_query'][$search_str])) {
      $query = $_SESSION['tripal_pub_AGL_query'][$search_str];
    }
    if (empty($query['yaz_connection'])) {
      drupal_set_message("ERROR retrieving records from AGL: no open connection", "error");
      return $pubs;
    }
    $yazc = $query['yaz_connection'];
    $count = isset($query['Count']) ? (int) $query['Count'] : 0;

    // Clamp the requested window to the available hits before asking the
    // server for anything.
    $start = max(0, (int) $start);
    $limit = min((int) $limit, $count - $start);
    if ($limit <= 0) {
      return $pubs;
    }

    // YAZ record positions are 1-based. Request exactly the page that is about
    // to be read so the records are fetched together.
    yaz_range($yazc, $start + 1, $limit);
    $presented = yaz_present($yazc);
    // yaz_present() only prepares the retrieval; yaz_wait() carries it out.
    yaz_wait();
    if (!$presented or yaz_error($yazc) != '') {
      $error_no = yaz_errno($yazc);
      $error_msg = yaz_error($yazc);
      $additional = yaz_addinfo($yazc);
      if ($additional != '' and $additional != $error_msg) {
        $error_msg .= " $additional";
      }
      drupal_set_message("ERROR retrieving records from AGL: ($error_no) $error_msg", "error");
      return $pubs;
    }

    $last = $start + $limit;
    for ($position = $start + 1; $position <= $last; $position++) {
      // Ask for MARCXML, converting the record from MARC-8 to UTF-8.
      $pub_xml = yaz_record($yazc, $position, 'xml; charset=marc-8,utf-8');
      $pubs[] = tripal_pub_AGL_parse_pubxml($pub_xml);
    }

    return $pubs;
  }
  /**
   * Converts one MARCXML record returned by AGL into a publication array.
   *
   * Description of the MARC bibliographic format:
   * http://www.loc.gov/marc/bibliographic/bdsummary.html
   *
   * @param $pub_xml
   *   The record as an XML string made of 'controlfield' and 'datafield'
   *   elements.
   *
   * @return
   *   An associative array of publication details. Depending on the fields
   *   present it may hold 'Title', 'Year', 'Publication Date', 'Author List',
   *   'Keywords', 'Journal Name', 'Volume', 'Issue', 'Pages', 'Abstract',
   *   'URL', 'Publication Dbxref' and others. 'Authors', 'Citation' and 'raw'
   *   (the unparsed XML) are always set. An empty array is returned when
   *   $pub_xml is empty.
   */
  function tripal_pub_AGL_parse_pubxml($pub_xml) {
    $pub = array();
    if (!$pub_xml) {
      return $pub;
    }

    // Month numbers as found in the 008 field mapped to their abbreviations.
    $months = array(
      '01' => 'Jan', '02' => 'Feb', '03' => 'Mar',
      '04' => 'Apr', '05' => 'May', '06' => 'Jun',
      '07' => 'Jul', '08' => 'Aug', '09' => 'Sep',
      '10' => 'Oct', '11' => 'Nov', '12' => 'Dec',
    );

    // Datafields that are not interpreted; collected as a debugging aid only.
    $unhandled = array();

    // Read the XML and iterate through it.
    $xml = new XMLReader();
    $xml->xml($pub_xml);
    while ($xml->read()) {
      if ($xml->nodeType != XMLReader::ELEMENT) {
        continue;
      }
      $element = $xml->name;

      if ($element == 'controlfield') {
        $tag = $xml->getAttribute('tag');
        // Only step into the element when it has content; stepping past a
        // self-closing element would skip the node that follows it.
        $value = '';
        if (!$xml->isEmptyElement) {
          $xml->read();
          $value = $xml->value;
        }
        // Control fields 003, 005, 006 and 007 are not used.
        switch ($tag) {
          case '001': // control number
            $pub['Publication Accession'] = $value;
            break;

          case '008': // fixed length data elements
            $date1 = substr($value, 7, 4);  // positions 07-10: year of publication
            $date2 = substr($value, 11, 4); // positions 11-14: treated as month and day
            $place = substr($value, 15, 3);
            $lang = substr($value, 35, 3);
            if (preg_match('/^\d\d\d\d$/', $date1)) {
              $pub['Year'] = $date1;
              $pub['Publication Date'] = $date1;
              // Only add a month when the first two digits really are a month
              // number; the position may also hold a year.
              $matches = array();
              if (preg_match('/^(\d\d)(\d\d)?/', $date2, $matches) and isset($months[$matches[1]])) {
                $pub['Publication Date'] = $date1 . " " . $months[$matches[1]];
                if (!empty($matches[2]) and $matches[2] != '00') {
                  $pub['Publication Date'] .= " " . $matches[2];
                }
              }
            }
            if (strlen((string) $place) > 0 and !preg_match('/\s+/', $place)) {
              $pub['Published Location'] = $place;
            }
            if (strlen((string) $lang) > 0 and !preg_match('/\s+/', $lang)) {
              $pub['Language Abbr'] = $lang;
            }
            break;

          default: // unhandled tag
            break;
        }
      }
      elseif ($element == 'datafield') {
        // A self-closing datafield has no subfields and no end tag; reading its
        // subfields would consume the next datafield instead.
        if ($xml->isEmptyElement) {
          continue;
        }
        $tag = $xml->getAttribute('tag');
        $ind1 = $xml->getAttribute('ind1');

        // The switch compares loosely, so a zero-padded tag such as '035'
        // matches the label '35'.
        switch ($tag) {
          case '35': // System Control Number
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Publication Accession'] = $codes['a'];
            }
            break;

          case '40': // Cataloging Source (NR)
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              // original cataloging agency
              $pub['Publication Database'] = $codes['a'];
            }
            break;

          case '100': // Main Entry-Personal Name
          case '700': // Added Entry-Personal Name
            $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
            if ($author) {
              $pub['Author List'][] = $author;
            }
            break;

          case '110': // Main Entry-Corporate Name
          case '710': // Added Entry-Corporate Name
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            // 'a' is the corporate or jurisdiction name, 'b' the subordinate unit.
            $name_parts = array();
            foreach (array('a', 'b') as $code) {
              if (isset($codes[$code]) and $codes[$code] !== '') {
                $name_parts[] = $codes[$code];
              }
            }
            if ($name_parts) {
              $pub['Author List'][] = array('Collective' => implode(' ', $name_parts));
            }
            break;

          case '245': // Title Statement
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Title'] = preg_replace('/\.$/', '', $codes['a']);
            }
            if (isset($codes['b'])) {
              $pub['Title'] = isset($pub['Title']) ? $pub['Title'] . ' ' . $codes['b'] : $codes['b'];
            }
            if (isset($codes['h'])) {
              $pub['Publication Model'] = $codes['h'];
            }
            break;

          case '260': // Publication, Distribution, etc. (Imprint)
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Published Location'] = $codes['a'];
            }
            if (isset($codes['b'])) {
              $pub['Publisher'] = $codes['b'];
            }
            if (isset($codes['c'])) {
              $pub['Publication Date'] = $codes['c'];
            }
            break;

          case '300': // Physical Description
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pages = preg_replace('/^p\. /', '', $codes['a']);
              $pages = preg_replace('/\.$/', '', $pages);
              // A value ending in "p" is the number of pages, not the page
              // numbers, so it is skipped.
              if (!preg_match('/p$/', $pages)) {
                $pub['Pages'] = $pages;
              }
            }
            break;

          case '500': // General Note
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Notes'] = $codes['a'];
            }
            break;

          case '520': // Summary, etc.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Abstract'] = $codes['a'];
            }
            break;

          case '650': // Subject Added Entry-Topical Term
          case '653': // Index Term-Uncontrolled
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Keywords'][] = $codes['a'];
            }
            break;

          case '773': // Host Item Entry
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['t'])) {
              $pub['Journal Name'] = preg_replace('/\.$/', '', $codes['t']);
            }
            if (isset($codes['g'])) {
              // Subfield g is free text; try the known date layouts in turn.
              $value = $codes['g'];
              $matches = array();
              if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
                $pub['Year'] = $matches[1];
                $pub['Publication Date'] = $matches[1];
              }
              elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
                $year = $matches[4];
                $month = $matches[1];
                $day = $matches[3];
                $pub['Year'] = $year;
                $pub['Publication Date'] = "$year $month $day";
              }
              elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
                $year = $matches[3];
                $month = $matches[1];
                $pub['Year'] = $year;
                $pub['Publication Date'] = "$year $month";
              }
              elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
                $year = $matches[2];
                $month = $matches[1];
                $pub['Year'] = $year;
                $pub['Publication Date'] = "$year $month";
              }
              // The volume and issue are matched independently of the date.
              if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
                $pub['Volume'] = $matches[1];
              }
              if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
                $pub['Volume'] = $matches[1];
                $pub['Issue'] = $matches[3];
              }
              if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
                $pub['Issue'] = $matches[1];
              }
            }
            if (isset($codes['p'])) {
              $pub['Journal Abbreviation'] = $codes['p'];
            }
            if (isset($codes['z'])) {
              $pub['ISBN'] = $codes['z'];
            }
            break;

          case '856': // Electronic Location and Access
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['u'])) {
              $pub['URL'] = $codes['u'];
            }
            break;

          // Recognised fields that are deliberately not used.
          case '16':  // National Bibliographic Agency Control Number
          case '72':  // Subject Category Code
          case '111': // Main Entry-Meeting Name
          case '130': // Main Entry-Uniform Title
          case '210': // Abbreviated Title
          case '222': // Key Title
          case '240': // Uniform Title
          case '242': // Translation of Title by Cataloging Agency
          case '243': // Collective Uniform Title
          case '246': // Varying Form of Title
          case '247': // Former Title
          case '250': // Edition Statement
          case '254': // Musical Presentation Statement
          case '255': // Cartographic Mathematical Data
          case '256': // Computer File Characteristics
          case '257': // Country of Producing Entity
          case '258': // Philatelic Issue Data
          case '263': // Projected Publication Date
          case '264': // Production, Publication, Distribution, Manufacture and Copyright Notice
          case '270': // Address
          case '504': // Bibliography, Etc. Note
          case '852': // Location (where the publication is held)
            break;

          default:
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            $unhandled[$tag][] = $codes;
            break;
        }
      }
    }
    $xml->close();
    //dpm($unhandled);

    // Build the Dbxref from the database name and the accession.
    if (!empty($pub['Publication Accession']) and !empty($pub['Publication Database'])) {
      $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
      unset($pub['Publication Accession']);
      unset($pub['Publication Database']);
    }

    // Build the full authors list.
    $author_names = array();
    if (isset($pub['Author List'])) {
      foreach ($pub['Author List'] as $author) {
        if (isset($author['valid']) and $author['valid'] == 'N') {
          // Skip non-valid entries. A non-valid entry should have
          // a corresponding corrected entry so we can safely skip it.
          continue;
        }
        if (!empty($author['Collective'])) {
          $author_names[] = $author['Collective'];
        }
        elseif (!empty($author['Surname'])) {
          $initials = isset($author['First Initials']) ? $author['First Initials'] : '';
          $author_names[] = trim($author['Surname'] . ' ' . $initials);
        }
        elseif (!empty($author['Given Name'])) {
          // Names entered forename first carry no separate surname.
          $author_names[] = $author['Given Name'];
        }
      }
    }
    $pub['Authors'] = implode(', ', $author_names);

    // Build the citation.
    $pub['Citation'] = tripal_pub_create_citation($pub);
    $pub['raw'] = $pub_xml;

    return $pub;
  }
  /**
   * Collects the subfields of the datafield the reader is positioned on.
   *
   * The reader is advanced until the closing tag of the datafield (or the end
   * of the document) is reached.
   *
   * @param $xml
   *   An XMLReader positioned on a 'datafield' element.
   *
   * @return
   *   An associative array keyed by subfield code whose values are the subfield
   *   contents. When a code occurs more than once the last value wins.
   */
  function tripal_pub_remote_search_AGL_get_subfield($xml) {
    $codes = array();

    // A self-closing datafield has no end tag; reading on would collect the
    // subfields of the datafield that follows it.
    if ($xml->nodeType == XMLReader::ELEMENT and $xml->name == 'datafield' and $xml->isEmptyElement) {
      return $codes;
    }

    while ($xml->read()) {
      $node_name = $xml->name;

      // Stop once the end of the datafield element has been reached.
      if ($xml->nodeType == XMLReader::END_ELEMENT and $node_name == 'datafield') {
        break;
      }

      // For each subfield element record its code and its content.
      if ($xml->nodeType == XMLReader::ELEMENT and $node_name == 'subfield') {
        $code = $xml->getAttribute('code');
        $value = '';
        // Only step into the element when it has content; stepping past a
        // self-closing subfield would skip the subfield that follows it.
        if (!$xml->isEmptyElement) {
          $xml->read();
          $value = $xml->value;
        }
        $codes[$code] = $value;
      }
    }

    return $codes;
  }
  /**
   * Builds an author array from a personal name datafield.
   *
   * The subfields are read with tripal_pub_remote_search_AGL_get_subfield() and
   * only subfield 'a' (the name) is used.
   *
   * @param $xml
   *   An XMLReader positioned on the 'datafield' element holding the name.
   * @param $ind1
   *   The first indicator of the datafield: 0 means the forename comes first,
   *   1 means the surname comes first and 3 is a family name.
   *
   * @return
   *   An associative array. For indicator 0 (or a blank indicator) it holds
   *   'Given Name'; for indicator 1 it holds 'Surname', 'Given Name' and
   *   'First Initials'. It is empty for a family name, for any other indicator
   *   and when there is no subfield 'a'.
   */
  function tripal_pub_remote_search_AGL_get_author($xml, $ind1) {
    $author = array();

    $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
    if (!isset($codes['a'])) {
      return $author;
    }

    // Remove any trailing comma.
    $value = preg_replace('/,$/', '', $codes['a']);

    // Compare the indicator as a string; comparing a blank indicator with an
    // integer gives different results on different PHP versions.
    $name_type = trim((string) $ind1);

    if ($name_type === '0' or $name_type === '') {
      // Forename first: the name cannot be split reliably, so keep it whole.
      $author['Given Name'] = $value;
    }
    elseif ($name_type === '1') {
      // Surname first: the parts of the name are separated by commas.
      $parts = explode(',', $value);
      $author['Surname'] = trim(array_shift($parts));

      $given_names = array();
      foreach ($parts as $part) {
        $part = trim($part);
        if ($part !== '') {
          $given_names[] = $part;
        }
      }
      $author['Given Name'] = implode(' ', $given_names);

      // The initials are the first character of every word of the given name.
      $author['First Initials'] = '';
      $words = preg_split('/\s+/', $author['Given Name'], -1, PREG_SPLIT_NO_EMPTY);
      foreach ($words as $word) {
        $first = array();
        // Take a whole UTF-8 character so an accented initial is not cut in
        // half; fall back to the first byte when the text is not valid UTF-8.
        if (preg_match('/^./us', $word, $first)) {
          $author['First Initials'] .= $first[0];
        }
        else {
          $author['First Initials'] .= substr($word, 0, 1);
        }
      }
    }
    // A family name (indicator 3) is not handled.

    return $author;
  }