The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Apache::GuessCharset;

use strict;
our $VERSION = 0.03;
our $DEBUG   = 0;

use Apache::Constants;
use Apache::File;
use Encode::Guess;
use I18N::Charset;

# generated from http://www.iana.org/assignments/character-sets
# '/Name: (\S*)/ and $name = $1; /Alias: (\S*).*preferred MIME/ and print qq("$name" => "$1",)'
our %Prefered_MIME = (
    "ANSI_X3.4-1968" => "US-ASCII",
    "ISO_8859-1:1987" => "ISO-8859-1",
    "ISO_8859-2:1987" => "ISO-8859-2",
    "ISO_8859-3:1988" => "ISO-8859-3",
    "ISO_8859-4:1988" => "ISO-8859-4",
    "ISO_8859-6:1987" => "ISO-8859-6",
    "ISO_8859-6-E" => "ISO-8859-6-E",
    "ISO_8859-6-I" => "ISO-8859-6-I",
    "ISO_8859-7:1987" => "ISO-8859-7",
    "ISO_8859-8:1988" => "ISO-8859-8",
    "ISO_8859-8-E" => "ISO-8859-8-E",
    "ISO_8859-8-I" => "ISO-8859-8-I",
    "ISO_8859-5:1988" => "ISO-8859-5",
    "ISO_8859-9:1989" => "ISO-8859-9",
    "Extended_UNIX_Code_Packed_Format_for_Japanese" => "EUC-JP",
);

sub handler {
    my $r = shift;
    return DECLINED if
	! $r->is_main                  or
	$r->content_type !~ m@^text/@  or
	$r->content_type =~ /charset=/ or
	! -e $r->finfo                 or
	-d _                           or
	!(my $chunk = read_chunk($r));

    my @suspects = $r->dir_config->get('GuessCharsetSuspects');
    my $enc  = guess_encoding($chunk, @suspects);
    unless (ref $enc) {
	warn "Couldn't guess encoding: $enc" if $DEBUG;
	return DECLINED;
    }

    my $iana    = iana_charset_name($enc->name);
    my $charset = lc($Prefered_MIME{$iana} || $iana); # lowercased
    warn "Guessed: $charset" if $DEBUG;
    $r->content_type($r->content_type . "; charset=$charset");
    return OK;
}

sub read_chunk {
    my $r  = shift;
    my $fh = Apache::File->new($r->filename) or return;
    my $buffer_size = $r->dir_config('GuessCharsetBufferSize') || 512;
    read $fh, my($chunk), $buffer_size;
    return $chunk;
}

1;
__END__

=head1 NAME

Apache::GuessCharset - adds HTTP charset by guessing file's encoding

=head1 SYNOPSIS

  SetHandler perl-script
  PerlFixupHandler +Apache::GuessCharset

  # how many bytes to read for guessing (default 512)
  PerlSetVar GuessCharsetBufferSize 1024

  # list of encoding suspects
  PerlSetVar GuessCharsetSuspects euc-jp
  PerlAddVar GuessCharsetSuspects shiftjis
  PerlAddVar GuessCharsetSuspects 7bit-jis

=head1 DESCRIPTION

Apache::GuessCharset is an Apache fix-up handler which adds HTTP
charset attribute by automaticaly guessing text files' encodings via
Encode::Guess.

=head1 CONFIGURATION

This module uses following configuration variables.

=over 4

=item GuessCharsetSuspects

a list of encodings for C<Encode::Guess> to check. See
L<Encode::Guess> for details.

=item GuessCharsetBufferSize

specifies how many bytes for this module to read from source file, to
properly guess encodings. default is 512.

=back

=head1 AUTHOR

Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<Encode::Guess>, L<Apache::File>

=cut