The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
##---------------------------------------------------------------------------##
##  File:
##	$Id: CharEnt.pm,v 1.17 2010/12/31 18:23:02 ehood Exp $
##  Author:
##      Earl Hood       earl@earlhood.com
##  Description:
##	POD after __END__
##---------------------------------------------------------------------------##
##    Copyright (C) 1997-2002	Earl Hood, earl@earlhood.com
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program; if not, write to the Free Software
##    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
##    02111-1307, USA
##---------------------------------------------------------------------------##

package MHonArc::CharEnt;

use strict;
use MHonArc::CharMaps;
use MHonArc::Char;

##---------------------------------------------------------------------------
##      Charset specification to mapping
##---------------------------------------------------------------------------
##  NOTE: The mapping uses a single name for a charset.
##	  The CHARSETALIASES resource can be used to map aka names (aliases)
##	  to the names used here.
##  NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
##	  to &#xHHHH; entity references.
##  NOTE: iso-2022-{jp,kr} are translated to euc-{jp,kr} first before
##	  conversion.

my %CharsetMaps = (
    'iso-8859-1'     =>	'MHonArc/CharEnt/ISO8859_1.pm',
    'iso-8859-2'     =>	'MHonArc/CharEnt/ISO8859_2.pm',
    'iso-8859-3'     =>	'MHonArc/CharEnt/ISO8859_3.pm',
    'iso-8859-4'     =>	'MHonArc/CharEnt/ISO8859_4.pm',
    'iso-8859-5'     =>	'MHonArc/CharEnt/ISO8859_5.pm',
    'iso-8859-6'     =>	'MHonArc/CharEnt/ISO8859_6.pm',
    'iso-8859-7'     =>	'MHonArc/CharEnt/ISO8859_7.pm',
    'iso-8859-8'     =>	'MHonArc/CharEnt/ISO8859_8.pm',
    'iso-8859-9'     =>	'MHonArc/CharEnt/ISO8859_9.pm',
    'iso-8859-10'    =>	'MHonArc/CharEnt/ISO8859_10.pm',
    'iso-8859-11'    =>	'MHonArc/CharEnt/ISO8859_11.pm',
    'iso-8859-13'    =>	'MHonArc/CharEnt/ISO8859_13.pm',
    'iso-8859-14'    =>	'MHonArc/CharEnt/ISO8859_14.pm',
    'iso-8859-15'    =>	'MHonArc/CharEnt/ISO8859_15.pm',
    'iso-8859-16'    =>	'MHonArc/CharEnt/ISO8859_16.pm',
    'cp866'	     =>	'MHonArc/CharEnt/CP866.pm',
    'cp949'	     =>	'MHonArc/CharEnt/CP949.pm', # euc-kr
    'cp932'	     =>	'MHonArc/CharEnt/CP932.pm', # shiftjis
    'cp936'	     =>	'MHonArc/CharEnt/CP936.pm', # GBK
    'cp950'	     =>	'MHonArc/CharEnt/CP950.pm',
    'cp1250'	     =>	'MHonArc/CharEnt/CP1250.pm',
    'cp1251'	     =>	'MHonArc/CharEnt/CP1251.pm',
    'cp1252'	     =>	'MHonArc/CharEnt/CP1252.pm',
    'cp1253'	     =>	'MHonArc/CharEnt/CP1253.pm',
    'cp1254'	     =>	'MHonArc/CharEnt/CP1254.pm',
    'cp1255'	     =>	'MHonArc/CharEnt/CP1255.pm',
    'cp1256'	     =>	'MHonArc/CharEnt/CP1256.pm',
    'cp1257'	     =>	'MHonArc/CharEnt/CP1257.pm',
    'cp1258'	     =>	'MHonArc/CharEnt/CP1258.pm',
    'koi-0'	     =>	'MHonArc/CharEnt/KOI_0.pm',
    'koi-7'	     =>	'MHonArc/CharEnt/KOI_7.pm',
    'koi8-a'	     =>	'MHonArc/CharEnt/KOI8_A.pm',
    'koi8-b'	     =>	'MHonArc/CharEnt/KOI8_B.pm',
    'koi8-e'	     =>	'MHonArc/CharEnt/KOI8_E.pm',
    'koi8-f'	     =>	'MHonArc/CharEnt/KOI8_F.pm',
    'koi8-r'	     =>	'MHonArc/CharEnt/KOI8_R.pm',
    'koi8-u'	     =>	'MHonArc/CharEnt/KOI8_U.pm',
    'gost19768-87'   =>	'MHonArc/CharEnt/GOST19768_87.pm',
    'viscii'	     =>	'MHonArc/CharEnt/VISCII.pm',
    'macarabic'	     =>	'MHonArc/CharEnt/AppleArabic.pm',
    'maccentraleurroman' => 'MHonArc/CharEnt/AppleCenteuro.pm',
    'maccroatian'    =>	'MHonArc/CharEnt/AppleCroatian.pm',
    'maccyrillic'    =>	'MHonArc/CharEnt/AppleCyrillic.pm',
    'macgreek'	     =>	'MHonArc/CharEnt/AppleGreek.pm',
    'machebrew'	     =>	'MHonArc/CharEnt/AppleHebrew.pm',
    'macicelandic'   =>	'MHonArc/CharEnt/AppleIceland.pm',
    'macromanian'    =>	'MHonArc/CharEnt/AppleRomanian.pm',
    'macroman'	     =>	'MHonArc/CharEnt/AppleRoman.pm',
    'macthai'	     =>	'MHonArc/CharEnt/AppleThai.pm',
    'macturkish'     =>	'MHonArc/CharEnt/AppleTurkish.pm',
    'big5-eten'      =>	'MHonArc/CharEnt/BIG5_ETEN.pm',
    'big5-hkscs'     =>	'MHonArc/CharEnt/BIG5_HKSCS.pm',
    'gb2312'         =>	'MHonArc/CharEnt/GB2312.pm',
    'euc-jp'         =>	'MHonArc/CharEnt/EUC_JP.pm',
    'hp-roman8'      =>	'MHonArc/CharEnt/HP_ROMAN8.pm',
);

my $char_maps = MHonArc::CharMaps->new(\%CharsetMaps);

###############################################################################
##	Routines
###############################################################################

sub str2sgml {
    my $data 	 =    shift;
    my $charset  = lc shift;

    my $data_r  = ref($data) ? $data : \$data;
    $charset =~ tr/_/-/;

    # UTF-8 can be converted algorithmically.
    if ($charset eq 'utf-8') {
	_utf8_to_sgml($data_r);
	return $$data_r;
    }
    # If us-ascii, use simple s/// operation.
    if ($charset eq 'us-ascii') {
	$$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
	return $$data_r;
    }

    MHonArc::Char::map_conv($data_r, $charset, $char_maps, \%HTMLSpecials);
}

##---------------------------------------------------------------------------##
##  Private Routines.

# Array of masks for lead byte in UTF-8 (for Perl <5.6)
# This could be computed on-the-fly, but using an array is faster
my @utf8_lb_mask = (
    0x3F, 0x1F, 0xF, 0x7, 0x3, 0x1  # 1, 2, 3, 4, 5, 6 bytes, respectively
);
# Regex pattern for UTF-8 data
my $utf8_re = q/([\x00-\x7F]|
		 [\xC0-\xDF][\x80-\xBF]|
		  \xE0      [\xA0-\xBF][\x80-\xBF]|
		 [\xE1-\xEF][\x80-\xBF]{2}|
		  \xF0      [\x90-\xBF][\x80-\xBF]{2}|
		 [\xF1-\xF7][\x80-\xBF]{3}|
		  \xF8      [\x88-\xBF][\x80-\xBF]{3}|
		 [\xF9-\xFB][\x80-\xBF]{4}|
		  \xFC      [\x84-\xBF][\x80-\xBF]{4}|
		  \xFD      [\x80-\xBF]{5}|
		 .)/;
# A lax regex for UTF-8 data.  Used for utf-8-aware perl since perl
# will validate sequences
my $utf8_re_lax =
	      q/([\x00-\x7F]|
		 [\xC0-\xDF][\x00-\xFF]|
		 [\xE0-\xEF][\x00-\xFF]{2}|
		 [\xF0-\xF7][\x00-\xFF]{3}|
		 [\xF8-\xFB][\x00-\xFF]{4}|
		 [\xFC-\xFD][\x00-\xFF]{5}|
		 .)/;

sub _utf8_to_sgml {
    my $data_r = shift;

    if ($] >= 5.006) {
	# UTF-8-aware perl
	# Have to enable warnings to get stricter utf-8 checks for Perl 5.8
	use warnings;
	my($char, $ord, $malformed);

	# Define local warn handle to suppress malformed utf-8 warning
	# messages and to flag when such occurrences happen.
	my $cur_sig_warn = $SIG{__WARN__};
	local $SIG{__WARN__} = sub {
	    $malformed = 1;
	    #warn @_;
	    # invoke current warn handler, if defined
	    &$cur_sig_warn  if defined($cur_sig_warn) && ref($cur_sig_warn);
	};
	$$data_r =~ s{
	    $utf8_re_lax
	}{
            # Bug #26577: Perl 5.10 changed unpack behavior
	    $char = ($] >= 5.010)? unpack('C0U*',$1): unpack('U0U*',$1);
	    if ($malformed ||
		  (($char & 0xFFFE) == 0xFFFE) ||
		  (($char & 0xFFFF) == 0xFFFF) ||
		  ($char >= 0xFDD0 && $char <= 0xFDEF) ||
		  ($char >= 0xD800 && $char <= 0xDFFF)
	       ) {
		# Some of the if() checks may be handled by perl directly,
		# but such checks can be disabled when perl is built.
		$malformed = 0;
		'&#xFFFD;';
	    } else {
		($char <= 0x7F)
			? $HTMLSpecials{$1} || sprintf('%c',$char)
			: sprintf('&#x%X;',$char);
	    }
	}gxeso;

    } else {
	# non-UTF-8-aware perl
	my($i, $n, $char);
	$$data_r =~ s{
	    $utf8_re
	}{
	    if (($n = length($1)) == 1) {
		my $ord = ord($1);
		if ($ord > 0x7F) {
		    # Malformed sequence
		    '&#xFFFD;';
		} else {
		    # 7-bit ASCII
		    $HTMLSpecials{$1} || $1;
		}
	    } else {
		# Multi-byte sequence
		$char = (unpack('C',substr($1,0,1)) &
			 $utf8_lb_mask[$n-1]) << ($n-1)*6;
		for ($i=1; $i < $n; ++$i) {
		    $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
			     (($n-$i-1)*6));
		}
		if ($char <= 0x7F    ||	# should only be single byte sequence
		    (($char & 0xFFFE) == 0xFFFE) ||	    # not a char
		    (($char & 0xFFFF) == 0xFFFF) ||	    # not a char
		    ($char >= 0xFDD0 && $char <= 0xFDEF) || # not a char
		    ($char >= 0xD800 && $char <= 0xDFFF)    # surrogates
		   ) {
		    '&#xFFFD;';
		} else {
		    sprintf('&#x%X;',$char);
		}
	    }
       }gxseo;
    }
}

##---------------------------------------------------------------------------##
1;
__END__

=head1 NAME

MHonArc::CharEnt - HTML Character routines for MHonArc.

=head1 SYNOPSIS

  use MHonArc::CharEnt;

  MHonArc resource file:

    <CharsetConverters>
    ...
    iso-8859-15;    MHonArc::CharEnt::str2sgml;     MHonArc/CharEnt.pm
    ...
    </CharsetConverters>

=head1 DESCRIPTION

MHonArc::CharEnt provides the main character conversion routine
used by MHonArc for converting non-ASCII encoded message header data
and text/plain character data into HTML.  This module was initially
written to just support 8-bit only charsets.  However, it has been
extended to support multibyte charsets.

All characters are mapped to HTML 4.0 character entity references
(e.g. &lt; &gt;) or to Unicode numeric character entity references
(e.g. &#x203E;).  Most modern browsers will support the Unicode
references directly.

=head1 NOTES

=over

=item *

This module relies on MHonArc's CHARSETALIASES resource for defining
alternate names for charset supported.

=item *

Most character conversion is done through mapping tables that
are dynamicly loaded on a as-needed basis.  There is probably
room for optimization by trying to replace tables for charsets
with algorithmic conversion solutions.

UTF-8 conversion is done algorithmically.

=item *

A main goal of this module is to convert raw non-ASCII data of
various character sets to ASCII data using entity references for
non-ASCII characters.  This way, archive files will all be in ASCII,
with modern compliant HTML browsers being able to handle the rendering
of non-ASCII characters from the standard named and numeric character
entity references.

This does make reading the raw HTML source for non-English languages
difficult, but this may be a non-issue with most users.

=back

=head1 VERSION

$Id: CharEnt.pm,v 1.17 2010/12/31 18:23:02 ehood Exp $

=head1 AUTHOR

Earl Hood, earl@earlhood.com

MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only
under the terms of the GNU General Public License, which may be found in
the MHonArc distribution.

=cut