# $Id: Utils.pm,v 1.35 2007/06/28 13:59:31 sondberg Exp $
package ZOOM::IRSpy::Utils;
use 5.008;
use strict;
use warnings;
use Exporter 'import';
our @EXPORT_OK = qw(utf8param
isodate
xml_encode
cql_quote
cql_target
irspy_xpath_context
irspy_make_identifier
irspy_record2identifier
irspy_identifier2target
modify_xml_document
bib1_access_point
render_record);
use XML::LibXML;
use XML::LibXML::XPathContext;
use Encode;
use Encode qw(is_utf8);
our $IRSPY_NS = 'http://indexdata.com/irspy/1.0';
# Utility functions follow, exported for use of web UI
# Fetches the parameter named $key from the request object $r (anything
# with a param() method) and returns it decoded from UTF-8.
#
# Returns undef when the parameter is absent.  Passing a third
# argument is treated as a caller error and is fatal.  A warning is
# emitted whenever decoding actually changes the string.
sub utf8param {
    my($r, $key, $value) = @_;

    if (defined $value) {
        die "utf8param() called with value '$value'";
    }

    my $octets = $r->param($key);
    # Explicit undef (not a bare return) so that list-context callers
    # still receive exactly one element.
    return undef unless defined $octets;

    my $decoded = decode_utf8($octets);
    if ($decoded ne $octets) {
        warn "converted '$octets' to '", $decoded, "'\n";
    }
    return $decoded;
}
# Formats the epoch time $time as YYYY-MM-DDTHH:MM:SS, using the
# broken-down fields that localtime() returns for it.
sub isodate {
    my($time) = @_;

    # localtime() yields (sec, min, hour, mday, mon, year, ...); the
    # month is zero-based and the year is an offset from 1900.
    my @fields = localtime($time);
    my($year, $month) = ($fields[5] + 1900, $fields[4] + 1);

    return sprintf("%04d-%02d-%02dT%02d:%02d:%02d",
                   $year, $month, @fields[3, 2, 1, 0]);
}
# I can't -- just can't, can't, can't -- believe that this function
# isn't provided by one of the core XML modules. But the evidence all
# says that it's not: among other things, XML::Generator and
# Template::Plugin both roll their own. So I will do likewise. D'oh!
#
# Returns a copy of $text that is safe to embed in XML/HTML markup:
# the characters & < > " are replaced by the named entity references
# &amp; &lt; &gt; &quot;.
#
# Arguments:
#   $text     - the string to encode; may be undefined if $fallback is given
#   $fallback - optional string used (and encoded) when $text is undefined
#   $opts     - optional hash reference; if {nbsp} is true, every space
#               is additionally replaced by &nbsp;
# Both $fallback and $opts are optional, so a reference in second
# position is taken to be $opts.
#
# Dies (with a stack trace) if $text and $fallback are both undefined.
sub xml_encode {
    my($text, $fallback, $opts) = @_;

    if (!defined $opts && ref $fallback) {
        # The second and third arguments are both optional
        $opts = $fallback;
        $fallback = undef;
    }
    $opts = {} if !defined $opts;
    $text = $fallback if !defined $text;
    use Carp;
    confess "xml_encode(): text and fallback both undefined"
        if !defined $text;

    # Entity names keyed by the character they stand for.  Apostrophes
    # are deliberately left alone: Internet Explorer can't display
    # &apos; (!) so we don't create it.
    my %entity_for = (
        '&' => 'amp',
        '<' => 'lt',
        '>' => 'gt',
        '"' => 'quot',
        ' ' => 'nbsp',
    );
    my $special = $opts->{nbsp} ? qr/[&<>" ]/ : qr/[&<>"]/;

    # A single pass means an ampersand introduced by one replacement
    # can never be encoded a second time.
    $text =~ s/($special)/&$entity_for{$1};/g;
    return $text;
}
# Quotes a term for use in a CQL query
# Returns $term made safe for use as a single CQL term: the characters
# " \ * ? are backslash-escaped, and the result is wrapped in double
# quotes if it contains whitespace, a double quote or a slash.
sub cql_quote {
    my($term) = @_;

    (my $escaped = $term) =~ s/([""\\*?])/\\$1/g;
    my $needs_quotes = $escaped =~ /[\s""\/]/;

    return $needs_quotes ? qq["$escaped"] : $escaped;
}
# Makes a CQL query that finds a specified target. Arguments may be
# either an ID alone, or a (host, port, db) triple.
# Builds the CQL query "rec.id=<identifier>".  When $host is defined
# the four arguments are combined by irspy_make_identifier();
# otherwise the first argument is taken to be the identifier itself.
# The identifier is passed through cql_quote().
sub cql_target {
    my($protocol, $host, $port, $db) = @_;

    my $id = defined $host
        ? irspy_make_identifier($protocol, $host, $port, $db)
        : $protocol;

    return "rec.id=" . cql_quote($id);
}
# PRIVATE to irspy_namespace() and irspy_xpath_context()
my %_namespaces = (
    'e' => 'http://explain.z3950.org/dtd/2.0/',    # prefix used for e:* elements
    'i' => $IRSPY_NS,                              # IRSpy's own namespace
);
# Returns the namespace URI registered for $prefix in %_namespaces.
# An undefined prefix is fatal with a stack trace; a prefix with no
# registered URI is fatal with a plain message.
sub irspy_namespace {
    my($prefix) = @_;

    use Carp;
    defined $prefix
        or confess "irspy_namespace(undef)";

    my $uri = $_namespaces{$prefix};
    return $uri if defined $uri;

    die "irspy_namespace(): no URI for namespace prefix '$prefix'";
}
# Returns an XML::LibXML::XPathContext with every prefix in
# %_namespaces registered.  $record may be:
#   - a ZOOM::Record, which is first replaced by its render() result;
#   - any other reference, which is used directly as the context node;
#   - a plain string, which is parsed as XML and whose document
#     element becomes the context node.
sub irspy_xpath_context {
    my($record) = @_;

    $record = $record->render()
        if ref $record && $record->isa("ZOOM::Record");

    my $root = $record;
    if (!ref $root) {
        # Not a node yet: parse the text and start from its root element
        my $doc = XML::LibXML->new()->parse_string($record);
        $root = $doc->getDocumentElement();
    }

    my $xc = XML::LibXML::XPathContext->new($root);
    $xc->registerNs($_, $_namespaces{$_}) for keys %_namespaces;
    return $xc;
}
# Construct an opaque identifier from its components. Although it's
# trivial, this is needed in so many places that it really needs to be
# factored out.
#
# This is the converse of _parse_target_string() in IRSpy.pm, which
# should be renamed and moved into this package.
#
# Returns the identifier "protocol:host:port/dbname".  Exactly four
# arguments are required and all of them must be defined; anything
# else is fatal.
sub irspy_make_identifier {
    my($protocol, $host, $port, $dbname) = @_;

    if (@_ != 4) {
        my $arglist = join(", ", map { "'$_'" } @_);
        die "irspy_make_identifier(" . $arglist .
            "): wrong number of arguments";
    }

    # Checked in argument order, so the first undefined one is reported
    my @components = (
        [ protocol => $protocol ],
        [ host     => $host ],
        [ port     => $port ],
        [ dbname   => $dbname ],
    );
    foreach my $component (@components) {
        my($label, $supplied) = @$component;
        die "irspy_make_identifier(): $label undefined"
            if !defined $supplied;
    }

    return join("", $protocol, ":", $host, ":", $port, "/", $dbname);
}
# Returns the opaque identifier of an IRSpy record based on the
# XPathContext'ed DOM object, as returned by irspy_xpath_context().
# This is doing the same thing as irspy_make_identifier() but from a
# record rather than a set of parameters.
#
# Evaluates, against the XPath context $xc, an expression that
# concatenates the e:serverInfo protocol attribute, host, port and
# database into "protocol:host:port/database", and returns whatever
# $xc->find() returns for it.
sub irspy_record2identifier {
    my($xc) = @_;

    ### Must be kept the same as is used in ../../../zebra/*.xsl
    my $id_xpath = "concat(e:serverInfo/\@protocol, ':',
e:serverInfo/e:host, ':',
e:serverInfo/e:port, '/',
e:serverInfo/e:database)";

    return $xc->find($id_xpath);
}
# Transforms an IRSpy opaque identifier, as returned from
# irspy_make_identifier() or irspy_record2identifier(), into a YAZ
# target-string suitable for feeding to ZOOM. Before we introduced
# the protocol element at the start of the identifier string, this was
# a null transform; now we have to be a bit cleverer.
#
# Public entry point: hands its arguments to
# _irspy_identifier2target() and returns the resulting target string.
# Kept as a separate wrapper so that the conversion can be traced by
# re-enabling the carp line below.
sub irspy_identifier2target {
    my @args = @_;

    my $target = _irspy_identifier2target(@args);
    #carp "converted ID '@args' to target '$target'";
    return $target;
}
# Splits the identifier $id at its first colon into a protocol and a
# remainder, and returns the remainder with the prefix appropriate to
# that protocol (matched case-insensitively):
#   Z39.50 -> "tcp:"
#   SRU    -> "sru=get,http:"
#   SRW    -> "sru=srw,http:"
# An unrecognised protocol produces a warning and the bare remainder.
# An identifier that is undefined or contains no colon produces a
# single warning and undef.
sub _irspy_identifier2target {
    my($id) = @_;

    my($protocol, $target);
    ($protocol, $target) = ($id =~ /(.*?):(.*)/) if defined $id;
    if (!defined $protocol) {
        # Nothing to dispatch on: say so once, rather than tripping
        # over the undefined protocol in every comparison below.
        warn "no protocol prefix in ID ",
            (defined $id ? "'$id'" : "(undef)");
        return undef;
    }

    my %prefix_for = (
        "Z39.50" => "tcp:",
        "SRU"    => "sru=get,http:",
        "SRW"    => "sru=srw,http:",
    );
    my $prefix = $prefix_for{uc($protocol)};
    return $prefix . $target if defined $prefix;

    warn "unrecognised protocol '$protocol' in ID $id";
    return $target;
}
# Applies the values in %$data to the document behind the XPath
# context $xc and returns the list of field definitions that changed.
#
# Arguments:
#   $xc          - XPath context used to locate nodes
#   $fieldsByKey - hash reference mapping each key to an array
#                  reference whose fourth element is the XPath of the
#                  field and whose remaining elements are the XPaths of
#                  siblings the field should be inserted after
#   $data        - hash reference of key => new value
#
# For each key: an existing attribute or element is updated only if
# its value differs; a missing node is created via dom_add_node()
# unless the new value is empty.  Dies if a key in %$data has no entry
# in %$fieldsByKey.
sub modify_xml_document {
    my($xc, $fieldsByKey, $data) = @_;

    my @changes = ();
    foreach my $key (keys %$data) {
        my $value = $data->{$key};
        my $ref = $fieldsByKey->{$key} or die "no field '$key'";
        # Only the XPath and the add-after list are needed here
        my(undef, undef, undef, $xpath, @addAfter) = @$ref;

        my @nodes = $xc->findnodes($xpath);
        if (!@nodes) {
            # No need to create a new empty node -- but "0" is a
            # perfectly good value, so test for emptiness, not truth.
            next if !defined $value || $value eq "";

            my($ppath, $selector) = $xpath =~ /(.*)\/(.*)/;
            # An XPath with no "/" names a child of the context node
            ($ppath, $selector) = ("", $xpath) if !defined $selector;

            dom_add_node($xc, $ppath, $selector, $value, @addAfter);
            push @changes, $ref;
            next;
        }

        warn scalar(@nodes), " nodes match '$xpath'" if @nodes > 1;
        my $node = $nodes[0];

        if ($node->isa("XML::LibXML::Attr")) {
            if ($value ne $node->getValue()) {
                $node->setValue($value);
                push @changes, $ref;
            }
        } elsif ($node->isa("XML::LibXML::Element")) {
            # The contents could be any mixture of text and comments
            # and maybe even other crud such as processing
            # instructions.  The simplest thing is just to throw it
            # all away and start again, making a single Text node the
            # canonical representation.  But before we do that, we
            # check whether the element is already canonical, to
            # determine whether our change is a no-op.
            my $old = "";
            my @children = $node->childNodes();
            if (@children == 1) {
                my $child = $node->firstChild();
                # Exact class match on purpose: a subclass of Text is
                # not the canonical form.
                if (ref $child && ref $child eq "XML::LibXML::Text") {
                    $old = $child->getData();
                }
            }
            next if $value eq $old;

            $node->removeChildNodes();
            $node->appendChild(XML::LibXML::Text->new($value));
            push @changes, $ref;
        } else {
            warn "unexpected node type $node";
        }
    }

    return @changes;
}
# Debugging aid: returns a single string showing $text in quotes,
# followed by the code point of each of its characters and the state
# of its internal UTF-8 flag, e.g. "'ab' (97 98), is_utf8=".
#
# The result is one string in every context, so it prints the same
# whether it is passed to print/warn or assigned to a scalar.
sub _renderchars {
    my($text) = @_;

    my $codepoints = join(" ", map { ord($_) } split //, $text);
    return "'" . $text . "' (" . $codepoints . "), is_utf8=" . is_utf8($text);
}
# Adds a new attribute or element holding $value beneath the node
# identified by $ppath, creating that node if necessary.
#
# Arguments:
#   $xc       - XPath context used to locate nodes
#   $ppath    - XPath of the parent node
#   $selector - name of the new node, optionally namespace-prefixed;
#               a leading "@" means an attribute rather than an element
#   $value    - text content (or attribute value) of the new node
#   @addAfter - XPaths, relative to the parent, of the siblings that
#               should precede a new element; it is inserted after the
#               last one in this list that exists, or else as the
#               parent's first child
#
# Dies if the parent can be neither found nor made.
sub dom_add_node {
    my($xc, $ppath, $selector, $value, @addAfter) = @_;

    #print "Adding $selector='$value' at '$ppath' after (", join(", ", map { "'$_'" } @addAfter), ")<br/>\n";
    my $node = find_or_make_node($xc, $ppath, 0);
    die "couldn't find or make node '$ppath'" if !defined $node;

    my $is_attr = ($selector =~ s/^@//);
    my(undef, $prefix, $simpleSel) = $selector =~ /((.*?):)?(.*)/;
    #warn "selector='$selector', prefix='$prefix', simpleSel='$simpleSel'";
    if ($is_attr) {
        if (defined $prefix) {
            ### This seems to be a no-op (thanks, DOM!) for reasons
            # unknown; it's not needed for IRSpy, so it has not
            # been debugged.
            $node->setAttributeNS(irspy_namespace($prefix),
                                  $simpleSel, $value);
        } else {
            $node->setAttribute($simpleSel, $value);
        }
        return;
    }

    my $new = XML::LibXML::Element->new($simpleSel);
    $new->setNamespace(irspy_namespace($prefix), $prefix)
        if defined $prefix;
    $new->appendText($value);

    # Walk the predecessors backwards so that the new node lands after
    # the last of them that is actually present.
    foreach my $predecessor (reverse @addAfter) {
        my($child) = $xc->findnodes($predecessor, $node);
        if (defined $child) {
            $node->insertAfter($new, $child);
            #warn "Added after '$predecessor'";
            return;
        }
    }

    # Didn't find any of the nodes that are supposed to precede the
    # new one, so we need to insert the new node as the first of the
    # parent's children.  However *sigh* there is no prependChild()
    # analogous to appendChild(), so we have to go the long way round.
    my @children = $node->childNodes();
    if (@children) {
        $node->insertBefore($new, $children[0]);
        #warn "Added new first child";
    } else {
        $node->appendChild($new);
        #warn "Added new only child";
    }

    # Disabled debugging aid: dumps the class hierarchy of $xc
    if (0) {
        my $text = xml_encode(inheritance_tree($xc));
        $text =~ s/\n/<br\/>$&/sg;
        print "<pre>$text</pre>\n";
    }
    return;
}
# Returns the first node matching the XPath $path in the context $xc.
# If nothing matches, the last step of $path is created as a new
# element (namespaced if the step carries a prefix), appended to its
# parent -- which is found or made in the same way -- and returned.
#
# $recursion_level counts how many levels of missing ancestors have
# been descended through; callers pass 0 (or omit it).  Dies once ten
# levels have been exceeded, which guards against a path that can
# never be resolved.
sub find_or_make_node {
    my($xc, $path, $recursion_level) = @_;

    $recursion_level = 0 if !defined $recursion_level;
    die "deep recursion in find_or_make_node($path)"
        if $recursion_level >= 10;
    # An empty path means the context node itself
    $path = "." if $path eq "";

    my @nodes = $xc->findnodes($path);
    if (@nodes == 0) {
        # The node doesn't exist, so make it: split off the final step
        # and attach a new element for it to the parent.
        my(undef, $ppath, $element) = $path =~ /((.*)\/)?(.*)/;
        $ppath = "" if !defined $ppath;
        #warn "path='$path', ppath='$ppath', element='$element'";
        #warn "no node '$path': making it";
        my $parent = find_or_make_node($xc, $ppath, $recursion_level + 1);

        my(undef, $prefix, $nsElem) = $element =~ /((.*?):)?(.*)/;
        #warn "element='$element', prefix='$prefix', nsElem='$nsElem'";
        my $new = XML::LibXML::Element->new($nsElem);
        if (defined $prefix) {
            #warn "setNamespace($prefix)";
            $new->setNamespace(irspy_namespace($prefix), $prefix);
        }

        $parent->appendChild($new);
        return $new;
    }

    warn scalar(@nodes), " nodes match parent '$path'" if @nodes > 1;
    return $nodes[0];
}
# Debugging aid: returns a multi-line string showing the class $type
# (a class name, or an object whose class is used) and, indented one
# tab per generation beneath it, everything listed in each class's
# @ISA.  The top line is marked with "--> ".
#
# $level is the current depth and is only supplied by the recursive
# calls; more than 20 levels is reported as too deep.
sub inheritance_tree {
    my($type, $level) = @_;

    $level = 0 if !defined $level;
    return "Woah! Too deep, man!\n" if $level > 20;
    $type = ref $type if ref $type;

    my $text = "";
    $text = "--> " if $level == 0;
    $text .= ("\t" x $level) . "$type\n";

    # Look the package's @ISA up by name rather than string-eval'ing
    # the class name, so no text in $type is ever compiled as code.
    my @superclasses = do {
        no strict 'refs';
        @{"${type}::ISA"};
    };
    foreach my $superclass (@superclasses) {
        $text .= inheritance_tree($superclass, $level+1);
    }

    return $text;
}
# This function is made available in xslt using the register_function call
# Compares its two arguments as strings and returns -1, 0 or 1 in the
# manner of the cmp operator.  Both arguments are stringified first.
sub xslt_strcmp {
    my($left, $right) = @_;

    my $order = "$left" cmp "$right";
    return $order;
}
### It feels like this should be in YAZ, exported via ZOOM-Perl.
# Maps numeric access-point codes to human-readable labels; consulted
# by bib1_access_point().  The keys are not contiguous: codes with no
# entry here are reported as unknown by that function.
my %_bib1_access_point = (
1 => "Personal name",
2 => "Corporate name",
3 => "Conference name",
4 => "Title",
5 => "Title series",
6 => "Title uniform",
7 => "ISBN",
8 => "ISSN",
9 => "LC card number",
10 => "BNB card no.",
11 => "BGF number",
12 => "Local number",
13 => "Dewey classification",
14 => "UDC classification",
15 => "Bliss classification",
16 => "LC call number",
17 => "NLM call number",
18 => "NAL call number",
19 => "MOS call number",
20 => "Local classification",
21 => "Subject heading",
22 => "Subject Rameau",
23 => "BDI index subject",
24 => "INSPEC subject",
25 => "MESH subject",
26 => "PA subject",
27 => "LC subject heading",
28 => "RVM subject heading",
29 => "Local subject index",
30 => "Date",
31 => "Date of publication",
32 => "Date of acquisition",
33 => "Title key",
34 => "Title collective",
35 => "Title parallel",
36 => "Title cover",
37 => "Title added title page",
38 => "Title caption",
39 => "Title running",
40 => "Title spine",
41 => "Title other variant",
42 => "Title former",
43 => "Title abbreviated",
44 => "Title expanded",
45 => "Subject precis",
46 => "Subject rswk",
47 => "Subject subdivision",
48 => "No. nat'l biblio.",
49 => "No. legal deposit",
50 => "No. govt pub.",
51 => "No. music publisher",
52 => "Number db",
53 => "Number local call",
54 => "Code--language",
55 => "Code--geographic area",
56 => "Code--institution",
57 => "Name and title *",
58 => "Name geographic",
59 => "Place publication",
60 => "CODEN",
61 => "Microform generation",
62 => "Abstract",
63 => "Note",
# The table jumps from 63 to 1000 here
1000 => "Author-title",
1001 => "Record type",
1002 => "Name",
1003 => "Author",
1004 => "Author-name personal",
1005 => "Author-name corporate",
1006 => "Author-name conference",
1007 => "Identifier--standard",
1008 => "Subject--LC children's",
1009 => "Subject name -- personal",
1010 => "Body of text",
1011 => "Date/time added to db",
1012 => "Date/time last modified",
1013 => "Authority/format id",
1014 => "Concept-text",
1015 => "Concept-reference",
1016 => "Any",
1017 => "Server-choice",
1018 => "Publisher",
1019 => "Record-source",
1020 => "Editor",
1021 => "Bib-level",
1022 => "Geographic-class",
1023 => "Indexed-by",
1024 => "Map-scale",
1025 => "Music-key",
1026 => "Related-periodical",
1027 => "Report-number",
1028 => "Stock-number",
# NOTE(review): there is no entry for 1029 in this table -- confirm
# that the omission is intentional.
1030 => "Thematic-number",
1031 => "Material-type",
1032 => "Doc-id",
1033 => "Host-item",
1034 => "Content-type",
1035 => "Anywhere",
1036 => "Author-Title-Subject",
# NOTE(review): key 1032 already appears above.  When a hash is built
# from a list the later pair wins, so this is the label that lookups
# of 1032 return and the earlier "Doc-id" entry is never seen.
1032 => "Doc-id (semantic definition change)",
1037 => "SICI",
1038 => "Abstract-language",
1039 => "Application-kind",
1040 => "Classification",
1041 => "Classification-basic",
1042 => "Classification-local-record",
1043 => "Enzyme",
1044 => "Possessing-institution",
1045 => "Record-linking",
1046 => "Record-status",
1047 => "Treatment",
1048 => "Control-number-GKD",
1049 => "Control-number-linking",
1050 => "Control-number-PND",
1051 => "Control-number-SWD",
1052 => "Control-number-ZDB",
1053 => "Country-publication (country of Publication)",
1054 => "Date-conference (meeting date)",
1055 => "Date-record-status",
1056 => "Dissertation-information",
1057 => "Meeting-organizer",
1058 => "Note-availability",
1059 => "Number-CAS-registry (CAS registry number)",
1060 => "Number-document (document number)",
1061 => "Number-local-accounting",
1062 => "Number-local-acquisition",
1063 => "Number-local-call-copy-specific",
1064 => "Number-of-reference (reference count)",
1065 => "Number-norm",
1066 => "Number-volume",
1067 => "Place-conference (meeting location)",
1068 => "Reference (references and footnotes)",
1069 => "Referenced-journal (reference work)",
1070 => "Section-code",
1071 => "Section-heading",
1072 => "Subject-GOO",
1073 => "Subject-name-conference",
1074 => "Subject-name-corporate",
1075 => "Subject-genre/form",
1076 => "Subject-name-geographical",
1077 => "Subject--chronological",
1078 => "Subject--title",
1079 => "Subject--topical",
1080 => "Subject-uncontrolled",
1081 => "Terminology-chemical (chemical name)",
1082 => "Title-translated",
1083 => "Year-of-beginning",
1084 => "Year-of-ending",
1085 => "Subject-AGROVOC",
1086 => "Subject-COMPASS",
1087 => "Subject-EPT",
1088 => "Subject-NAL",
1089 => "Classification-BCM",
1090 => "Classification-DB",
1091 => "Identifier-ISRC",
1092 => "Identifier-ISMN",
1093 => "Identifier-ISRN",
1094 => "Identifier-DOI",
1095 => "Code-language-original",
1096 => "Title-later",
1097 => "DC-Title",
1098 => "DC-Creator",
1099 => "DC-Subject",
1100 => "DC-Description",
1101 => "DC-Publisher",
1102 => "DC-Date",
1103 => "DC-ResourceType",
1104 => "DC-ResourceIdentifier",
1105 => "DC-Language",
1106 => "DC-OtherContributor",
1107 => "DC-Format",
1108 => "DC-Source",
1109 => "DC-Relation",
1110 => "DC-Coverage",
1111 => "DC-RightsManagement",
1112 => "Controlled Subject Index",
1113 => "Subject Thesaurus",
1114 => "Index Terms -- Controlled",
1115 => "Controlled Term",
1116 => "Spatial Domain",
1117 => "Bounding Coordinates",
1118 => "West Bounding Coordinate",
1119 => "East Bounding Coordinate",
1120 => "North Bounding Coordinate",
1121 => "South Bounding Coordinate",
1122 => "Place",
1123 => "Place Keyword Thesaurus",
1124 => "Place Keyword",
1125 => "Time Period",
1126 => "Time Period Textual",
1127 => "Time Period Structured",
1128 => "Beginning Date",
1129 => "Ending Date",
1130 => "Availability",
1131 => "Distributor",
1132 => "Distributor Name",
1133 => "Distributor Organization",
1134 => "Distributor Street Address",
1135 => "Distributor City",
1136 => "Distributor State or Province",
1137 => "Distributor Zip or Postal Code",
1138 => "Distributor Country",
1139 => "Distributor Network Address",
1140 => "Distributor Hours of Service",
1141 => "Distributor Telephone",
1142 => "Distributor Fax",
1143 => "Resource Description",
1144 => "Order Process",
1145 => "Order Information",
1146 => "Cost",
1147 => "Cost Information",
1148 => "Technical Prerequisites",
1149 => "Available Time Period",
1150 => "Available Time Textual",
1151 => "Available Time Structured",
1152 => "Available Linkage",
1153 => "Linkage Type",
1154 => "Linkage",
1155 => "Sources of Data",
1156 => "Methodology",
1157 => "Access Constraints",
1158 => "General Access Constraints",
1159 => "Originator Dissemination Control",
1160 => "Security Classification Control",
1161 => "Use Constraints",
1162 => "Point of Contact",
1163 => "Contact Name",
1164 => "Contact Organization",
1165 => "Contact Street Address",
1166 => "Contact City",
1167 => "Contact State or Province",
1168 => "Contact Zip or Postal Code",
1169 => "Contact Country",
1170 => "Contact Network Address",
1171 => "Contact Hours of Service",
1172 => "Contact Telephone",
1173 => "Contact Fax",
1174 => "Supplemental Information",
1175 => "Purpose",
1176 => "Agency Program",
1177 => "Cross Reference",
1178 => "Cross Reference Title",
1179 => "Cross Reference Relationship",
1180 => "Cross Reference Linkage",
1181 => "Schedule Number",
1182 => "Original Control Identifier",
1183 => "Language of Record",
1184 => "Record Review Date",
1185 => "Performer",
1186 => "Performer-Individual",
1187 => "Performer-Group",
1188 => "Instrumentation",
1189 => "Instrumentation-Original",
1190 => "Instrumentation-Current",
1191 => "Arrangement",
1192 => "Arrangement-Original",
1193 => "Arrangement-Current",
1194 => "Musical Key-Original",
1195 => "Musical Key-Current",
1196 => "Date-Composition",
1197 => "Date-Recording",
1198 => "Place-Recording",
1199 => "Country-Recording",
1200 => "Number-ISWC",
1201 => "Number-Matrix",
1202 => "Number-Plate",
1203 => "Classification-McColvin",
1204 => "Duration",
1205 => "Number-Copies",
1206 => "Musical Theme",
1207 => "Instruments - total number",
1208 => "Instruments - distinct number",
1209 => "Identifier - URN",
1210 => "Sears Subject Heading",
1211 => "OCLC Number",
1212 => "Composition",
1213 => "Intellectual level",
1214 => "EAN",
1215 => "NLC",
1216 => "CRCS",
1217 => "Nationality",
1218 => "Equinox",
1219 => "Compression",
1220 => "Format",
1221 => "Subject - occupation",
1222 => "Subject - function",
1223 => "Edition",
);
# Returns the label recorded in %_bib1_access_point for the code $ap,
# or a message saying that the attribute is unknown when the table
# has no (true) label for it.
sub bib1_access_point {
    my($ap) = @_;

    my $label = $_bib1_access_point{$ap};
    return $label if $label;

    return "unknown BIB-1 attribute '$ap'";
}
# Fetches record number $which from the result set $rs with the
# element-set name temporarily set to $elementSetName, and returns
# the record's render() output.
#
# The value that option() returns when the temporary name is set is
# passed back to option() afterwards -- presumably the previous
# setting -- whether or not fetching the record succeeded.  Any
# exception raised by the fetch is rethrown unchanged once the option
# has been put back.
sub render_record {
    my($rs, $which, $elementSetName) = @_;

    # There is a slight race condition here on the element-set name,
    # but it shouldn't be a problem as this is (currently) only called
    # from parts of the program that run single-threaded.
    my $old = $rs->option(elementSetName => $elementSetName);

    my $rec;
    # Success is tracked with a flag rather than by testing $@, so an
    # exception object is rethrown untouched.
    my $fetched = eval { $rec = $rs->record($which); 1 };
    my $error = $@;

    $rs->option(elementSetName => $old);
    die $error if !$fetched;

    return $rec->render();
}
1;