The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

require 5;
#                        The documentation is at the end.
# Time-stamp: "2004-05-07 15:31:25 ADT"
package Pod::Escapes;
require Exporter;
@ISA = ('Exporter');
$VERSION = '1.04';
@EXPORT_OK = qw(
  %Code2USASCII
  %Name2character
  %Name2character_number
  %Latin1Code_to_fallback
  %Latin1Char_to_fallback
  e2char
  e2charnum
);
%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);

#==========================================================================

use strict;
use vars qw(
  %Code2USASCII
  %Name2character
  %Name2character_number
  %Latin1Code_to_fallback
  %Latin1Char_to_fallback
  $FAR_CHAR
  $FAR_CHAR_NUMBER
  $NOT_ASCII
);

$FAR_CHAR = "?" unless defined $FAR_CHAR;
$FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER;

$NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII;

#--------------------------------------------------------------------------
sub e2char {
  my $in = $_[0];
  return undef unless defined $in and length $in;
  
  # Convert to decimal:
  if($in =~ m/^(0[0-7]*)$/s ) {
    $in = oct $in;
  } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
    $in = hex $1;
  } # else it's decimal, or named

  if($NOT_ASCII) {
    # We're in bizarro world of not-ASCII!
    # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR.
    unless($in =~ m/^\d+$/s) {
      # It's a named character reference.  Get its numeric Unicode value.
      $in = $Name2character{$in};
      return undef unless defined $in;  # (if there's no such name)
      $in = ord $in; # (All ents must be one character long.)
        # ...So $in holds the char's US-ASCII numeric value, which we'll
        #  now go get the local equivalent for.
    }

    # It's numeric, whether by origin or by mutation from a known name
    return $Code2USASCII{$in} # so "65" => "A" everywhere
        || $Latin1Code_to_fallback{$in} # Fallback.
        || $FAR_CHAR; # Fall further back
  }
  
  # Normal handling:
  if($in =~ m/^\d+$/s) {
    if($] < 5.007  and  $in > 255) { # can't be trusted with Unicode
      return $FAR_CHAR;
    } else {
      return chr($in);
    }
  } else {
    return $Name2character{$in}; # returns undef if unknown
  }
}

#--------------------------------------------------------------------------
sub e2charnum {
  my $in = $_[0];
  return undef unless defined $in and length $in;
  
  # Convert to decimal:
  if($in =~ m/^(0[0-7]*)$/s ) {
    $in = oct $in;
  } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
    $in = hex $1;
  } # else it's decimal, or named

  if($in =~ m/^\d+$/s) {
    return 0 + $in;
  } else {
    return $Name2character_number{$in}; # returns undef if unknown
  }
}

#--------------------------------------------------------------------------

%Name2character_number = (
 # General XML/XHTML:
 'lt'   => 60,
 'gt'   => 62,
 'quot' => 34,
 'amp'  => 38,
 'apos' => 39,

 # POD-specific:
 'sol'    => 47,
 'verbar' => 124,

 'lchevron' => 171, # legacy for laquo
 'rchevron' => 187, # legacy for raquo

 # Remember, grave looks like \ (as in virtu\)
 #           acute looks like / (as in re/sume/)
 #           circumflex looks like ^ (as in papier ma^che/)
 #           umlaut/dieresis looks like " (as in nai"ve, Chloe")

 # From the XHTML 1 .ent files:
 'nbsp'     , 160,
 'iexcl'    , 161,
 'cent'     , 162,
 'pound'    , 163,
 'curren'   , 164,
 'yen'      , 165,
 'brvbar'   , 166,
 'sect'     , 167,
 'uml'      , 168,
 'copy'     , 169,
 'ordf'     , 170,
 'laquo'    , 171,
 'not'      , 172,
 'shy'      , 173,
 'reg'      , 174,
 'macr'     , 175,
 'deg'      , 176,
 'plusmn'   , 177,
 'sup2'     , 178,
 'sup3'     , 179,
 'acute'    , 180,
 'micro'    , 181,
 'para'     , 182,
 'middot'   , 183,
 'cedil'    , 184,
 'sup1'     , 185,
 'ordm'     , 186,
 'raquo'    , 187,
 'frac14'   , 188,
 'frac12'   , 189,
 'frac34'   , 190,
 'iquest'   , 191,
 'Agrave'   , 192,
 'Aacute'   , 193,
 'Acirc'    , 194,
 'Atilde'   , 195,
 'Auml'     , 196,
 'Aring'    , 197,
 'AElig'    , 198,
 'Ccedil'   , 199,
 'Egrave'   , 200,
 'Eacute'   , 201,
 'Ecirc'    , 202,
 'Euml'     , 203,
 'Igrave'   , 204,
 'Iacute'   , 205,
 'Icirc'    , 206,
 'Iuml'     , 207,
 'ETH'      , 208,
 'Ntilde'   , 209,
 'Ograve'   , 210,
 'Oacute'   , 211,
 'Ocirc'    , 212,
 'Otilde'   , 213,
 'Ouml'     , 214,
 'times'    , 215,
 'Oslash'   , 216,
 'Ugrave'   , 217,
 'Uacute'   , 218,
 'Ucirc'    , 219,
 'Uuml'     , 220,
 'Yacute'   , 221,
 'THORN'    , 222,
 'szlig'    , 223,
 'agrave'   , 224,
 'aacute'   , 225,
 'acirc'    , 226,
 'atilde'   , 227,
 'auml'     , 228,
 'aring'    , 229,
 'aelig'    , 230,
 'ccedil'   , 231,
 'egrave'   , 232,
 'eacute'   , 233,
 'ecirc'    , 234,
 'euml'     , 235,
 'igrave'   , 236,
 'iacute'   , 237,
 'icirc'    , 238,
 'iuml'     , 239,
 'eth'      , 240,
 'ntilde'   , 241,
 'ograve'   , 242,
 'oacute'   , 243,
 'ocirc'    , 244,
 'otilde'   , 245,
 'ouml'     , 246,
 'divide'   , 247,
 'oslash'   , 248,
 'ugrave'   , 249,
 'uacute'   , 250,
 'ucirc'    , 251,
 'uuml'     , 252,
 'yacute'   , 253,
 'thorn'    , 254,
 'yuml'     , 255,

 'fnof'     , 402,
 'Alpha'    , 913,
 'Beta'     , 914,
 'Gamma'    , 915,
 'Delta'    , 916,
 'Epsilon'  , 917,
 'Zeta'     , 918,
 'Eta'      , 919,
 'Theta'    , 920,
 'Iota'     , 921,
 'Kappa'    , 922,
 'Lambda'   , 923,
 'Mu'       , 924,
 'Nu'       , 925,
 'Xi'       , 926,
 'Omicron'  , 927,
 'Pi'       , 928,
 'Rho'      , 929,
 'Sigma'    , 931,
 'Tau'      , 932,
 'Upsilon'  , 933,
 'Phi'      , 934,
 'Chi'      , 935,
 'Psi'      , 936,
 'Omega'    , 937,
 'alpha'    , 945,
 'beta'     , 946,
 'gamma'    , 947,
 'delta'    , 948,
 'epsilon'  , 949,
 'zeta'     , 950,
 'eta'      , 951,
 'theta'    , 952,
 'iota'     , 953,
 'kappa'    , 954,
 'lambda'   , 955,
 'mu'       , 956,
 'nu'       , 957,
 'xi'       , 958,
 'omicron'  , 959,
 'pi'       , 960,
 'rho'      , 961,
 'sigmaf'   , 962,
 'sigma'    , 963,
 'tau'      , 964,
 'upsilon'  , 965,
 'phi'      , 966,
 'chi'      , 967,
 'psi'      , 968,
 'omega'    , 969,
 'thetasym' , 977,
 'upsih'    , 978,
 'piv'      , 982,
 'bull'     , 8226,
 'hellip'   , 8230,
 'prime'    , 8242,
 'Prime'    , 8243,
 'oline'    , 8254,
 'frasl'    , 8260,
 'weierp'   , 8472,
 'image'    , 8465,
 'real'     , 8476,
 'trade'    , 8482,
 'alefsym'  , 8501,
 'larr'     , 8592,
 'uarr'     , 8593,
 'rarr'     , 8594,
 'darr'     , 8595,
 'harr'     , 8596,
 'crarr'    , 8629,
 'lArr'     , 8656,
 'uArr'     , 8657,
 'rArr'     , 8658,
 'dArr'     , 8659,
 'hArr'     , 8660,
 'forall'   , 8704,
 'part'     , 8706,
 'exist'    , 8707,
 'empty'    , 8709,
 'nabla'    , 8711,
 'isin'     , 8712,
 'notin'    , 8713,
 'ni'       , 8715,
 'prod'     , 8719,
 'sum'      , 8721,
 'minus'    , 8722,
 'lowast'   , 8727,
 'radic'    , 8730,
 'prop'     , 8733,
 'infin'    , 8734,
 'ang'      , 8736,
 'and'      , 8743,
 'or'       , 8744,
 'cap'      , 8745,
 'cup'      , 8746,
 'int'      , 8747,
 'there4'   , 8756,
 'sim'      , 8764,
 'cong'     , 8773,
 'asymp'    , 8776,
 'ne'       , 8800,
 'equiv'    , 8801,
 'le'       , 8804,
 'ge'       , 8805,
 'sub'      , 8834,
 'sup'      , 8835,
 'nsub'     , 8836,
 'sube'     , 8838,
 'supe'     , 8839,
 'oplus'    , 8853,
 'otimes'   , 8855,
 'perp'     , 8869,
 'sdot'     , 8901,
 'lceil'    , 8968,
 'rceil'    , 8969,
 'lfloor'   , 8970,
 'rfloor'   , 8971,
 'lang'     , 9001,
 'rang'     , 9002,
 'loz'      , 9674,
 'spades'   , 9824,
 'clubs'    , 9827,
 'hearts'   , 9829,
 'diams'    , 9830,
 'OElig'    , 338,
 'oelig'    , 339,
 'Scaron'   , 352,
 'scaron'   , 353,
 'Yuml'     , 376,
 'circ'     , 710,
 'tilde'    , 732,
 'ensp'     , 8194,
 'emsp'     , 8195,
 'thinsp'   , 8201,
 'zwnj'     , 8204,
 'zwj'      , 8205,
 'lrm'      , 8206,
 'rlm'      , 8207,
 'ndash'    , 8211,
 'mdash'    , 8212,
 'lsquo'    , 8216,
 'rsquo'    , 8217,
 'sbquo'    , 8218,
 'ldquo'    , 8220,
 'rdquo'    , 8221,
 'bdquo'    , 8222,
 'dagger'   , 8224,
 'Dagger'   , 8225,
 'permil'   , 8240,
 'lsaquo'   , 8249,
 'rsaquo'   , 8250,
 'euro'     , 8364,
);


# Fill out %Name2character...
{
  %Name2character = ();
  my($name, $number);
  while( ($name, $number) = each %Name2character_number) {
    if($] < 5.007  and  $number > 255) {
      $Name2character{$name} = $FAR_CHAR;
      # substitute for Unicode characters, for perls
      #  that can't reliable handle them
    } else {
      $Name2character{$name} = chr $number;
      # normal case
    }
  }
  # So they resolve 'right' even in EBCDIC-land
  $Name2character{'lt'  }   = '<';
  $Name2character{'gt'  }   = '>';
  $Name2character{'quot'}   = '"';
  $Name2character{'amp' }   = '&';
  $Name2character{'apos'}   = "'";
  $Name2character{'sol' }   = '/';
  $Name2character{'verbar'} = '|';
}

#--------------------------------------------------------------------------

%Code2USASCII = (
# mostly generated by
#  perl -e "printf qq{  \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)"
   32, ' ',
   33, '!',
   34, '"',
   35, '#',
   36, '$',
   37, '%',
   38, '&',
   39, "'", #!
   40, '(',
   41, ')',
   42, '*',
   43, '+',
   44, ',',
   45, '-',
   46, '.',
   47, '/',
   48, '0',
   49, '1',
   50, '2',
   51, '3',
   52, '4',
   53, '5',
   54, '6',
   55, '7',
   56, '8',
   57, '9',
   58, ':',
   59, ';',
   60, '<',
   61, '=',
   62, '>',
   63, '?',
   64, '@',
   65, 'A',
   66, 'B',
   67, 'C',
   68, 'D',
   69, 'E',
   70, 'F',
   71, 'G',
   72, 'H',
   73, 'I',
   74, 'J',
   75, 'K',
   76, 'L',
   77, 'M',
   78, 'N',
   79, 'O',
   80, 'P',
   81, 'Q',
   82, 'R',
   83, 'S',
   84, 'T',
   85, 'U',
   86, 'V',
   87, 'W',
   88, 'X',
   89, 'Y',
   90, 'Z',
   91, '[',
   92, "\\", #!
   93, ']',
   94, '^',
   95, '_',
   96, '`',
   97, 'a',
   98, 'b',
   99, 'c',
  100, 'd',
  101, 'e',
  102, 'f',
  103, 'g',
  104, 'h',
  105, 'i',
  106, 'j',
  107, 'k',
  108, 'l',
  109, 'm',
  110, 'n',
  111, 'o',
  112, 'p',
  113, 'q',
  114, 'r',
  115, 's',
  116, 't',
  117, 'u',
  118, 'v',
  119, 'w',
  120, 'x',
  121, 'y',
  122, 'z',
  123, '{',
  124, '|',
  125, '}',
  126, '~',
);

#--------------------------------------------------------------------------

%Latin1Code_to_fallback = ();
@Latin1Code_to_fallback{0xA0 .. 0xFF} = (
# Copied from Text/Unidecode/x00.pm:

' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-},
'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?},
'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss',
'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',

);

{
  # Now stuff %Latin1Char_to_fallback:
  %Latin1Char_to_fallback = ();
  my($k,$v);
  while( ($k,$v) = each %Latin1Code_to_fallback) {
    $Latin1Char_to_fallback{chr $k} = $v;
    #print chr($k), ' => ', $v, "\n";
  }
}

#--------------------------------------------------------------------------
1;
__END__

=head1 NAME

Pod::Escapes -- for resolving Pod EE<lt>...E<gt> sequences

=head1 SYNOPSIS

  use Pod::Escapes qw(e2char);
  ...la la la, parsing POD, la la la...
  $text = e2char($e_node->label);
  unless(defined $text) {
    print "Unknown E sequence \"", $e_node->label, "\"!";
  }
  ...else print/interpolate $text...

=head1 DESCRIPTION

This module provides things that are useful in decoding
Pod EE<lt>...E<gt> sequences.  Presumably, it should be used
only by Pod parsers and/or formatters.

By default, Pod::Escapes exports none of its symbols.  But
you can request any of them to be exported.
Either request them individually, as with
C<use Pod::Escapes qw(symbolname symbolname2...);>,
or you can do C<use Pod::Escapes qw(:ALL);> to get all
exportable symbols.

=head1 GOODIES

=over

=item e2char($e_content)

Given a name or number that could appear in a
C<EE<lt>name_or_numE<gt>> sequence, this returns the string that
it stands for.  For example, C<e2char('sol')>, C<e2char('47')>,
C<e2char('0x2F')>, and C<e2char('057')> all return "/",
because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
and C<EE<lt>057E<gt>>, all mean "/".  If
the name has no known value (as with a name of "qacute") or is
syntactally invalid (as with a name of "1/4"), this returns undef.

=item e2charnum($e_content)

Given a name or number that could appear in a
C<EE<lt>name_or_numE<gt>> sequence, this returns the number of
the Unicode character that this stands for.  For example,
C<e2char('sol')>, C<e2char('47')>,
C<e2char('0x2F')>, and C<e2char('057')> all return 47,
because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47.  If
the name has no known value (as with a name of "qacute") or is
syntactally invalid (as with a name of "1/4"), this returns undef.

=item $Name2character{I<name>}

Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
to the string that each stands for.  Note that this does not
include numerics (like "64" or "x981c").  Under old Perl versions
(before 5.7) you get a "?" in place of characters whose Unicode
value is over 255.

=item $Name2character_number{I<name>}

Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
to the Unicode value that each stands for.  For example,
C<$Name2character_number{'eacute'}> is 201, and
C<$Name2character_number{'eacute'}> is 8364.  You get the correct
Unicode value, regardless of the version of Perl you're using --
which differs from C<%Name2character>'s behavior under pre-5.7 Perls.

Note that this hash does not
include numerics (like "64" or "x981c").

=item $Latin1Code_to_fallback{I<integer>}

For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps
from the character code for a Latin-1 character (like 233 for
lowercase e-acute) to the US-ASCII character that best aproximates
it (like "e").  You may find this useful if you are rendering
POD in a format that you think deals well only with US-ASCII
characters.

=item $Latin1Char_to_fallback{I<character>}

Just as above, but maps from characters (like "\xE9", 
lowercase e-acute) to characters (like "e").

=item $Code2USASCII{I<integer>}

This maps from US-ASCII codes (like 32) to the corresponding
character (like space, for 32).  Only characters 32 to 126 are
defined.  This is meant for use by C<e2char($x)> when it senses
that it's running on a non-ASCII platform (where chr(32) doesn't
get you a space -- but $Code2USASCII{32} will).  It's
documented here just in case you might find it useful.

=back

=head1 CAVEATS

On Perl versions before 5.7, Unicode characters with a value
over 255 (like lambda or emdash) can't be conveyed.  This
module does work under such early Perl versions, but in the
place of each such character, you get a "?".  Latin-1
characters (characters 160-255) are unaffected.

Under EBCDIC platforms, C<e2char($n)> may not always be the
same as C<chr(e2charnum($n))>, and ditto for
C<$Name2character{$name}> and
C<chr($Name2character_number{$name})>.

=head1 SEE ALSO

L<perlpod|perlpod>

L<perlpodspec|perlpodspec>

L<Text::Unidecode|Text::Unidecode>

=head1 COPYRIGHT AND DISCLAIMERS

Copyright (c) 2001-2004 Sean M. Burke.  All rights reserved.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

This program is distributed in the hope that it will be useful, but
without any warranty; without even the implied warranty of
merchantability or fitness for a particular purpose.

Portions of the data tables in this module are derived from the
entity declarations in the W3C XHTML specification.

Currently (October 2001), that's these three:

 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent

=head1 AUTHOR

Sean M. Burke C<sburke@cpan.org>

=cut

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# What I used for reading the XHTML .ent files:

use strict;
my(@norms, @good, @bad);
my $dir = 'c:/sgml/docbook/';
my %escapes;
foreach my $file (qw(
  xhtml-symbol.ent
  xhtml-lat1.ent
  xhtml-special.ent
)) {
  open(IN, "<$dir$file") or die "can't read-open $dir$file: $!";
  print "Reading $file...\n";
  while(<IN>) {
    if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) {
      my($name, $value) = ($1,$2);
      next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt';
    
      $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s;
      print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s;
      if($value > 255) {
        push @good , sprintf "   %-10s , chr(%s),\n", "'$name'", $value;
        push @bad  , sprintf "   %-10s , \$bad,\n", "'$name'", $value;
      } else {
        push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value;
      }
    } elsif(m/<!ENT/) {
      print "# Skipping $_";
    }
  
  }
  close(IN);
}

print @norms;
print "\n ( \$] .= 5.006001 ? (\n";
print @good;
print " ) : (\n";
print @bad;
print " )\n);\n";

__END__
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~