package Bio::Phylo::NeXML::Entities;
use strict;
use warnings;
use base 'Exporter';
our @EXPORT_OK = qw'encode_entities decode_entities';
my %entity2char = (
# Some normal chars that have special meaning in SGML context
'&' => '&', # ampersand
'>' => '>', # greater than
'<' => '<', # less than
'"' => '"', # double quote
''' => "'", # single quote
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
'Æ' => chr(198), # capital AE diphthong (ligature)
'Á' => chr(193), # capital A, acute accent
'Â' => chr(194), # capital A, circumflex accent
'À' => chr(192), # capital A, grave accent
'Å' => chr(197), # capital A, ring
'Ã' => chr(195), # capital A, tilde
'Ä' => chr(196), # capital A, dieresis or umlaut mark
'Ç' => chr(199), # capital C, cedilla
'Ð' => chr(208), # capital Eth, Icelandic
'É' => chr(201), # capital E, acute accent
'Ê' => chr(202), # capital E, circumflex accent
'È' => chr(200), # capital E, grave accent
'Ë' => chr(203), # capital E, dieresis or umlaut mark
'Í' => chr(205), # capital I, acute accent
'Î' => chr(206), # capital I, circumflex accent
'Ì' => chr(204), # capital I, grave accent
'Ï' => chr(207), # capital I, dieresis or umlaut mark
'Ñ' => chr(209), # capital N, tilde
'Ó' => chr(211), # capital O, acute accent
'Ô' => chr(212), # capital O, circumflex accent
'Ò' => chr(210), # capital O, grave accent
'Ø' => chr(216), # capital O, slash
'Õ' => chr(213), # capital O, tilde
'Ö' => chr(214), # capital O, dieresis or umlaut mark
'Þ' => chr(222), # capital THORN, Icelandic
'Ú' => chr(218), # capital U, acute accent
'Û' => chr(219), # capital U, circumflex accent
'Ù' => chr(217), # capital U, grave accent
'Ü' => chr(220), # capital U, dieresis or umlaut mark
'Ý' => chr(221), # capital Y, acute accent
'á' => chr(225), # small a, acute accent
'â' => chr(226), # small a, circumflex accent
'æ' => chr(230), # small ae diphthong (ligature)
'à' => chr(224), # small a, grave accent
'å' => chr(229), # small a, ring
'ã' => chr(227), # small a, tilde
'ä' => chr(228), # small a, dieresis or umlaut mark
'ç' => chr(231), # small c, cedilla
'é' => chr(233), # small e, acute accent
'ê' => chr(234), # small e, circumflex accent
'è' => chr(232), # small e, grave accent
'ð' => chr(240), # small eth, Icelandic
'ë' => chr(235), # small e, dieresis or umlaut mark
'í' => chr(237), # small i, acute accent
'î' => chr(238), # small i, circumflex accent
'ì' => chr(236), # small i, grave accent
'ï' => chr(239), # small i, dieresis or umlaut mark
'ñ' => chr(241), # small n, tilde
'ó' => chr(243), # small o, acute accent
'ô' => chr(244), # small o, circumflex accent
'ò' => chr(242), # small o, grave accent
'ø' => chr(248), # small o, slash
'õ' => chr(245), # small o, tilde
'ö' => chr(246), # small o, dieresis or umlaut mark
'ß' => chr(223), # small sharp s, German (sz ligature)
'þ' => chr(254), # small thorn, Icelandic
'ú' => chr(250), # small u, acute accent
'û' => chr(251), # small u, circumflex accent
'ù' => chr(249), # small u, grave accent
'ü' => chr(252), # small u, dieresis or umlaut mark
'ý' => chr(253), # small y, acute accent
'ÿ' => chr(255), # small y, dieresis or umlaut mark
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
'©' => chr(169), # copyright sign
'®' => chr(174), # registered sign
' ' => chr(160), # non breaking space
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
'¡' => chr(161),
'¢' => chr(162),
'£' => chr(163),
'¤' => chr(164),
'¥' => chr(165),
'¦' => chr(166),
'§' => chr(167),
'¨' => chr(168),
'ª' => chr(170),
'«' => chr(171),
'¬' => chr(172),
'­' => chr(173),
'¯' => chr(175),
'°' => chr(176),
'±' => chr(177),
'¹' => chr(185),
'²' => chr(178),
'³' => chr(179),
'´' => chr(180),
'µ' => chr(181),
'¶' => chr(182),
'·' => chr(183),
'¸' => chr(184),
'º' => chr(186),
'»' => chr(187),
'¼' => chr(188),
'½' => chr(189),
'¾' => chr(190),
'¿' => chr(191),
'×' => chr(215),
'÷' => chr(247),
'Œ' => chr(338),
'œ' => chr(339),
'Š' => chr(352),
'š' => chr(353),
'Ÿ' => chr(376),
'ƒ' => chr(402),
'ˆ' => chr(710),
'˜' => chr(732),
'Α' => chr(913),
'Β' => chr(914),
'Γ' => chr(915),
'Δ' => chr(916),
'Ε' => chr(917),
'Ζ' => chr(918),
'Η' => chr(919),
'Θ' => chr(920),
'Ι' => chr(921),
'Κ' => chr(922),
'Λ' => chr(923),
'Μ' => chr(924),
'Ν' => chr(925),
'Ξ' => chr(926),
'Ο' => chr(927),
'Π' => chr(928),
'Ρ' => chr(929),
'Σ' => chr(931),
'Τ' => chr(932),
'Υ' => chr(933),
'Φ' => chr(934),
'Χ' => chr(935),
'Ψ' => chr(936),
'Ω' => chr(937),
'α' => chr(945),
'β' => chr(946),
'γ' => chr(947),
'δ' => chr(948),
'ε' => chr(949),
'ζ' => chr(950),
'η' => chr(951),
'θ' => chr(952),
'ι' => chr(953),
'κ' => chr(954),
'λ' => chr(955),
'μ' => chr(956),
'ν' => chr(957),
'ξ' => chr(958),
'ο' => chr(959),
'π' => chr(960),
'ρ' => chr(961),
'ς' => chr(962),
'σ' => chr(963),
'τ' => chr(964),
'υ' => chr(965),
'φ' => chr(966),
'χ' => chr(967),
'ψ' => chr(968),
'ω' => chr(969),
'ϑ' => chr(977),
'ϒ' => chr(978),
'ϖ' => chr(982),
' ' => chr(8194),
' ' => chr(8195),
' ' => chr(8201),
'‌' => chr(8204),
'‍' => chr(8205),
'‎' => chr(8206),
'‏' => chr(8207),
'–' => chr(8211),
'—' => chr(8212),
'‘' => chr(8216),
'’' => chr(8217),
'‚' => chr(8218),
'“' => chr(8220),
'”' => chr(8221),
'„' => chr(8222),
'†' => chr(8224),
'‡' => chr(8225),
'•' => chr(8226),
'…' => chr(8230),
'‰' => chr(8240),
'′' => chr(8242),
'″' => chr(8243),
'‹' => chr(8249),
'›' => chr(8250),
'‾' => chr(8254),
'⁄' => chr(8260),
'€' => chr(8364),
'ℑ' => chr(8465),
'℘' => chr(8472),
'ℜ' => chr(8476),
'™' => chr(8482),
'ℵ' => chr(8501),
'←' => chr(8592),
'↑' => chr(8593),
'→' => chr(8594),
'↓' => chr(8595),
'↔' => chr(8596),
'↵' => chr(8629),
'⇐' => chr(8656),
'⇑' => chr(8657),
'⇒' => chr(8658),
'⇓' => chr(8659),
'⇔' => chr(8660),
'∀' => chr(8704),
'∂' => chr(8706),
'∃' => chr(8707),
'∅' => chr(8709),
'∇' => chr(8711),
'∈' => chr(8712),
'∉' => chr(8713),
'∋' => chr(8715),
'∏' => chr(8719),
'∑' => chr(8721),
'−' => chr(8722),
'∗' => chr(8727),
'√' => chr(8730),
'∝' => chr(8733),
'∞' => chr(8734),
'∠' => chr(8736),
'∧' => chr(8743),
'∨' => chr(8744),
'∩' => chr(8745),
'∪' => chr(8746),
'∫' => chr(8747),
'∴' => chr(8756),
'∼' => chr(8764),
'≅' => chr(8773),
'≈' => chr(8776),
'≠' => chr(8800),
'≡' => chr(8801),
'≤' => chr(8804),
'≥' => chr(8805),
'⊂' => chr(8834),
'⊃' => chr(8835),
'⊄' => chr(8836),
'⊆' => chr(8838),
'⊇' => chr(8839),
'⊕' => chr(8853),
'⊗' => chr(8855),
'⊥' => chr(8869),
'⋅' => chr(8901),
'⌈' => chr(8968),
'⌉' => chr(8969),
'⌊' => chr(8970),
'⌋' => chr(8971),
'〈' => chr(9001),
'〉' => chr(9002),
'◊' => chr(9674),
'♠' => chr(9824),
'♣' => chr(9827),
'♥' => chr(9829),
'♦' => chr(9830),
);
# Make the opposite mapping
my %char2entity = map { $entity2char{$_} => $_ } keys %entity2char;
# Fill in missing entities
#for (0 .. 255) {
# next if exists $char2entity{chr($_)};
# $char2entity{chr($_)} = "&#$_;";
#}
sub encode_entities {
my ( $string, $chars ) = @_;
my %escape;
if ( $chars ) {
%escape = map { $_ => 1 } split //, $chars;
}
else {
%escape = map { $_ => 1 } keys %char2entity;
}
my @string = split //, $string;
for my $i ( 0 .. $#string ) {
my $c = $string[$i];
if ( $escape{$c} and $c ne '&' and $c ne ';' ) {
$string[$i] = $char2entity{$c};
}
elsif ( $escape{$c} and $c eq '&' ) {
my $maybe_entity = '';
FIND_SEMI: for my $j ( $i .. $#string ) {
$maybe_entity .= $string[$j];
last FIND_SEMI if $string[$j] eq ';';
}
if ( not exists $entity2char{$maybe_entity} ) {
$string[$i] = $char2entity{$c};
}
}
elsif( $escape{$c} and $c eq ';' ) {
my $maybe_entity = '';
FIND_AMP: for ( my $j = $i; $j >= 0; $j-- ) {
$maybe_entity = $string[$j] . $maybe_entity;
last FIND_SEMI if $string[$j] eq '&';
}
if ( not exists $entity2char{$maybe_entity} ) {
$string[$i] = $char2entity{$c};
}
}
}
return join '', @string;
}
sub decode_entities {
my @results;
for my $string ( @_ ) {
my @string = split //, $string;
for my $i ( 0 .. $#string ) {
my $c = $string[$i];
if ( $c eq '&' ) {
my $maybe_entity = '';
my $length = 0;
FIND_SEMI: for my $j ( $i .. $#string ) {
$maybe_entity .= $string[$j];
last FIND_SEMI if $string[$j] eq ';';
$length++;
}
if ( exists $entity2char{$maybe_entity} ) {
$string[$i] = $entity2char{$maybe_entity};
splice( @string, $i + 1, $length );
}
}
}
push @results, join '', @string;
}
return wantarray ? @results : $results[0];
}
1;
__END__
=head1 NAME
Bio::Phylo::NeXML::Entities - Functions for dealing with XML entities
=head1 DESCRIPTION
This package provides subroutines for dealing with characters that need to be
encoded as XML entities, and decoded in other formats. For example: C<&> needs
to be encoded as C<&> in XML. The subroutines have the same signatures and
the same names as those in the commonly-used module L<HTML::Entities>. They are
re-implemented here to avoid introducing dependencies.
=head1 SUBROUTINES
The following subroutines are utility functions that can be imported using:
use Bio::Phylo::NeXML::Entities '/entities/';
=over
=item encode_entities
Encodes problematic characters as XML entities
Type : Utility function
Title : encode_entities
Usage : my $encoded = encode_entities('string with & or >','>&')
Function: Encodes entities in first argument string
Returns : Modified string
Args : Required, first argument: a string to encode
Optional, second argument: a string that specifies
which characters to encode
=item decode_entities
Decodes XML entities into the characters they code for
Type : Utility function
Title : decode_entities
Usage : my $decoded = decode_entities('string with & or >')
Function: decodes encoded entities in argument string(s)
Returns : Array of decoded strings
Args : One or more encoded strings
=back
=head1 SEE ALSO
There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
for any user or developer questions and discussions.
=over
=item L<Bio::Phylo::Manual>
Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.
=back
=head1 CITATION
If you use Bio::Phylo in published research, please cite it:
B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
I<BMC Bioinformatics> B<12>:63.
L<http://dx.doi.org/10.1186/1471-2105-12-63>
=cut