The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package HTML::MobileConverter;
use strict;
use HTML::Parser;
use URI;

our $VERSION = '0.011';

sub new {
    my $class = shift;
    my $self = {@_};
    bless $self, $class;
    $self->init;
    return $self;
}

sub init {
    my $self = shift;
    $self->{maxlength} ||= 50000;
    $self->{maxpctagcount} ||= 10;
    $self->{maxpctagrate} ||= 0.1;
    $self->{baseuri} or warn 'baseuri is not specified.';
    $self->{hrefhandler} ||= sub {
        my $href = shift;
        return URI->new_abs($href, $self->{baseuri});
    };
    $self->{mobiletag} ||= {
        a => 'name|accesskey|href|cti|ijam|utn|subject|body|telbook|kana|email|ista|ilet|iswf|irst',
        base => 'href',
        blink => '',
        blockquote => '',
        body => 'bgcolor|text|link|alink|vlink',
        br => 'clear',
        center => '',
        dd => '',
        dir => 'type',
        div => 'align',
        dl => '',
        dt => '',
        font => 'color|size',
        form => 'action|method|utn',
        head => '',
        h1 => 'align',
        h2 => 'align',
        h3 => 'align',
        h4 => 'align',
        h5 => 'align',
        h6 => 'align',
        hr => 'align|size|width|noshade|color',
        html => '',
        img => 'src|align|width|height|hspace|vspace|alt',
        input => 'type|name|size|maxlength|accesskey|value|istyle|checked',
        li => 'type|value',
        marquee => 'behavior|direction|loop|height|width|scrollmount|scrolldelay|bgcolor',
        menu => 'type',
        meta => 'http\-equiv|content',
        object => 'declare|id|data|type|width|height',
        ol => 'type|start',
        option => 'selected|value',
        p => 'align',
        param => 'name|value|valuetype',
        plaintext => '',
        pre => '',
        select => 'name|size|multiple',
        textarea => 'name|accesskey|rows|cols|istyle',
        title => '',
        ul => 'type',
    };
    $self->{codetag} = 'script|style';
    $self->{ignoretag} = 'form|input|select|option|textarea';
    $self->{html} = '';
}

sub _initparser {
    my $self = shift;
    $self->{html2} = '';
    $self->{mhtml} = '';
    $self->{tagcount} = 0;
    $self->{mtagcount} = 0;
    $self->{ismobilecontent} = '';
    $self->{iscode} = 0;
    $self->{parser} = HTML::Parser->new(
        api_version => 3,
        handlers => {
            start => [$self->starthandler, 'text, tagname, attr'],
            end => [$self->endhandler, 'text, tagname'],
            text => [$self->texthandler, 'text'],
            default => [$self->defaulthandler, 'event, text'],
        },
    );
}

sub starthandler {
    my $self = shift;
    return sub {
        my ($text, $tag, $attr) = @_;
        $self->{tagcount}++;
        if ($tag =~ /^($self->{codetag})$/i) {
            $self->{iscode}++;
        }
        if (defined $self->{mobiletag}->{$tag}) {
            $self->{mtagcount}++;
            if ($tag =~ /^($self->{ignoretag})$/i) {
                return;
            } else {
                $self->{mhtml} .= $self->_makestartm($tag,$attr);
                $self->{html2} .= $self->_makestart2($tag,$attr);
            }
        }
    };
}

sub _makestartm {
    my $self = shift;
    my $tag = shift or return;
    my $attr = shift or return;
    if ($tag eq 'img') {
        my ($w,$h) = @$attr{'width', 'height'};
        unless ($w && $h && $w < 100 && $h < 100) {
            my $text = 'img:';
            $text .= $attr->{alt} || $attr->{title} || '';
            return $text;
        }
    }
    my $attrpat = $self->{mobiletag}->{$tag};
    my $text = qq|<$tag|;
    for my $key (keys %$attr) {
        if ($key =~ /^(href|action)$/) {
            $text .= qq| $key="| . 
                $self->{hrefhandler}($attr->{$key}) . '"';
        } elsif ($key eq 'src') {
            $text .= qq| $key="|. URI->new_abs($attr->{$key}, $self->{baseuri}) . '"';
        } elsif ($key =~ /^($attrpat)$/i) {
            $text .= qq| $key="$attr->{$key}"|;
        }
    }
    $text .= '>';
    return $text;
}

sub _makestart2 {
    my $self = shift;
    my $tag = shift or return;
    my $attr = shift or return;
    my $text = qq|<$tag|;
    for my $key (keys %$attr) {
        if ($key =~ /^(href|action)$/) {
            $text .= qq| $key="| . 
                $self->{hrefhandler}($attr->{$key}) . '"';
        } elsif ($key eq 'src') {
            $text .= qq| $key="|. URI->new_abs($attr->{$key}, $self->{baseuri}) . '"';
        } else {
            $text .= qq| $key="$attr->{$key}"|;
        }
    }
    $text .= '>';
    return $text;
}

sub endhandler {
    my $self = shift;
    return sub {
        my ($text, $tag) = @_;
        if ($tag =~ /^($self->{codetag})$/i) {
            $self->{iscode}--;
        }
        if (defined $self->{mobiletag}->{$tag}) {
            if ($tag =~ /^($self->{ignoretag})$/i) {
                $text = '';
            } else {
                $self->{mhtml} .= $text;
            }
            $self->{html2} .= $text;
        }
    };
}

sub texthandler {
    my $self = shift;
    return sub {
        my ($text) = @_;
        if (!$self->{iscode}) {
            $self->{mhtml} .= $text;
        }
        $self->{html2} .= $text;
    };
}

sub defaulthandler {
    my $self = shift;
    return sub {};
}

sub convert {
    my $self = shift;
    $self->{html} = shift or return;
    $self->_initparser;
    $self->{parser}->parse($self->{html});
    if ($self->_checkmobile) {
        return $self->{html2};
    } else {
        return $self->{mhtml};
    }
}

sub _checkmobile {
    my $self = shift;
    if ($self->{maxlength} && (length($self->{html}) > $self->{maxlength})) {
        return; # over max size
    } elsif (($self->{tagcount} - $self->{mtagcount}) > $self->{maxpctagcount}) {
        return; # includes many pc tags
    } elsif (!$self->{tagcount}) {
        # do nothing
    } elsif (($self->{mtagcount} / $self->{tagcount}) < (1 - $self->{maxpctagrate})) {
        return; # includes many pc tags
    }
    $self->{ismobilecontent} = 1;
    return $self->ismobilecontent;
}

sub ismobilecontent {
    my $self = shift;
    return $self->{ismobilecontent};
}

sub param { $_[0]->{$_[1]}; }

1;

__END__

=head1 NAME

HTML::MobileConverter - HTML Converter for mobile agent

=head1 SYNOPSIS

  use HTML::MobileConverter;

  my $baseuri = 'http://example.com/';
  my $c = HTML::MobileConverter->new(baseuri => $baseuri);
  my $html =<<END;
  <html><body>title<hr><a href="./my">my link</a></body></html>
  END
  print $c->convert($html); # get html with abs-uri.
  
  use URI;
  $html = <<END;
  <html><body>
  title<hr>
  <a href="./my">my link</a>
  <iframe src="./my"></iframe>
  </body></html>
  END
  $c = HTML::MobileConverter->new(
    baseuri => $baseuri,
    hrefhandler => sub {
      my $href = shift;
      return URI->new_abs($href, 'http://example.com/');
    },
  );
  print $c->convert($html); # get html without iframe.
  
  # create a proxy
  my $q = CGI->new;
  my $html = $c->convert(LWP::Simple:get($q->param('uri')));
  print Jcode->new($html)->sjis;

=head1 DESCRIPTION

HTML::MobileConverter parses HTML and returns new HTML for mobile agent 
(mainly for DoCoMo i-mode).
If the original HTML doesn't contain so many pc tags, it returns the original 
HTML strings with absolute uri (href,src...). If the original was guessed as 
a content for PC, it returns new HTML for mobile agent.

=head1 METHODS

Here are common methods of HTML::MobileConverter.

=over 4

=item new

  $c = HTML::MobileConverter->new;
  $c = HTML::MobileConverter->new(baseuri => 'http://www.example.com/');
  $c = HTML::MobileConverter->new(
    baseuri => 'http://www.example.com/',
    hrefhandler => sub {
      my $href = shift;
      $href = URI->new_abs($href, 'http://www.example.com/');
      return qq|/browse?uri=$href|;
    },
  );

creates a instance of HTML::MobileConverter. If you specify C<baseuri>, 
C<href/src/action> attributes will be replaced with absolute uris.

If you specify C<hrefhandler> with some function, href attribute will
be handled with the handler.

=item convert

  my $mhtml = $c->convert($html);

returns HTML strings for mobile.

=item ismobilecontent

  print "is mobile" if $c->ismobilecontent;

returns which the original HTML was guessed as mobile content or not.

=back

=head1 AUTHOR

Junya Kondo, E<lt>jkondo@hatena.ne.jpE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005 by Junya Kondo

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<HTML::Parser>
http://www.nttdocomo.co.jp/p_s/imode/tag/lineup.html (Japanese)

=cut