package HTML::EscapeEvil;
use strict;
use base qw(HTML::Filter Class::Accessor);
use HTML::Element;
use Carp;
our ( $ENTITY_REGEXP, %JS_EVENT, $VERSION );
__PACKAGE__->mk_accessors(
qw(allow_comment allow_declaration allow_process allow_entity_reference allow_script allow_style collection_process)
);
__PACKAGE__->mk_ro_accessors(qw(processes));
BEGIN {
$VERSION = 0.05;
my @allow_entity_references =
( "amp", "lt", "gt", "quot", "apos", "#039", "nbsp", "copy", "reg" );
$ENTITY_REGEXP = "&(" . ( join "|", @allow_entity_references ) . ")(;)";
# ============================================================================= #
# allow javascript event handler setting
# cite : javascript in href. e.g. <a href="javascript:alert('hello')">hello</a>
# ============================================================================= #
%JS_EVENT = (
cite => 0,
onblur => 0,
onchange => 0,
onclick => 0,
ondblclick => 0,
onerror => 0,
onfocus => 0,
onkeydown => 0,
onkeypress => 0,
onkeyup => 0,
onload => 0,
onmousedown => 0,
onmousemove => 0,
onmouseout => 0,
onmouseover => 0,
onmouseup => 0,
onreset => 0,
onselect => 0,
onsubmit => 0,
onunload => 0,
);
}
sub new {
my ( $class, %args ) = @_;
my $self = $class->SUPER::new;
foreach (
qw(allow_comment allow_declaration allow_process allow_style allow_script collection_process)
)
{
$self->{$_} = ( $args{$_} ) ? 1 : 0;
}
if ( $args{allow_entity_reference} ne ""
&& $args{allow_entity_reference} == 0 )
{
$self->{allow_entity_reference} = 0;
}
else {
$self->{allow_entity_reference} = 1;
}
$self->{processes} = [];
$self->{_content} = [];
$self->{_allow_tags} = {};
bless $self, ref $class || $class;
if ( $args{allow_tags} ) {
$self->add_allow_tags( ( ref( $args{allow_tags} ) eq "ARRAY" )
? @{ $args{allow_tags} }
: $args{allow_tags} );
}
return $self;
}
sub set_allow_tags {
my $self = shift;
$self->{_allow_tags} = {};
$self->{_current_tag} = undef;
$self->{allow_script} = 0;
$self->{allow_style} = 0;
$self->{allow_comment} = 0;
$self->{allow_declaration} = 0;
$self->{allow_process} = 0;
$self->{allow_entity_reference} = 1;
$self->{collection_process} = 0;
$self->clear_content;
$self->clear_process;
$self->add_allow_tags(@_);
}
sub add_allow_tags {
my ( $self, @tags ) = @_;
foreach my $tag (@tags) {
$tag = lc $tag;
if ( $tag eq "script" || $tag eq "style" ) {
$self->{"allow_$tag"} = 1;
next;
}
$self->{_allow_tags}->{$tag} = 1;
}
}
sub deny_tags {
my ( $self, @tags ) = @_;
foreach my $tag (@tags) {
$tag = lc $tag;
if ( $tag eq "script" || $tag eq "style" ) {
$self->{"allow_$tag"} = 0;
next;
}
delete $self->{_allow_tags}->{$tag};
}
}
sub get_allow_tags {
my $self = shift;
my @tags = keys %{ $self->{_allow_tags} };
push @tags, "script" if $self->{allow_script};
push @tags, "style" if $self->{allow_style};
return sort { $a cmp $b } @tags;
}
sub is_allow_tags {
my ( $self, $tag ) = @_;
my $flag;
$tag = lc $tag;
if ( $tag eq "script" || $tag eq "style" ) {
$flag = $self->{"allow_$tag"};
}
else {
$flag = ( exists $self->{_allow_tags}->{$tag} ) ? 1 : 0;
}
return ($flag) ? 1 : 0;
}
sub deny_all {
my $self = shift;
$self->{_allow_tags} = {};
$self->{_current_tag} = undef;
$self->{allow_script} = 0;
$self->{allow_style} = 0;
$self->{allow_comment} = 0;
$self->{allow_declaration} = 0;
$self->{allow_process} = 0;
$self->{allow_entity_reference} = 0;
}
sub filtered_html {
my $self = shift;
my $content = join "", @{ $self->{_content} };
$self->clear_content;
return $content;
}
sub filtered_file {
my $self = shift;
my $fh;
( ref( $_[0] ) eq "GLOB" || ref( \$_[0] ) eq "GLOB" )
? ( $fh = $_[0] )
: ( open $fh, "> $_[0]" or croak($!) );
print $fh $self->filtered_html;
close $fh;
}
sub filtered {
my $self = shift;
my $content;
if ( -e $_[0] || ref( $_[0] ) eq "GLOB" || ref( \$_[0] ) eq "GLOB" ) {
$self->parse_file( $_[0] );
}
elsif ( $_[0] ne "" ) {
$self->parse( $_[0] );
}
else {
croak("content is empty");
}
if ( $_[1] ) {
$self->filtered_file( $_[1] );
$content = 1;
}
else {
$content = $self->filtered_html;
}
$self->eof;
$self->{_current_tag} = undef;
return $content;
}
sub clear {
my $self = shift;
$self->eof;
$self->clear_process;
$self->clear_content;
$self->{_current_tag} = undef;
}
sub clear_content {
my $self = shift;
$self->{_content} = [] if scalar @{ $self->{_content} };
}
sub clear_process {
my $self = shift;
$self->{processes} = [] if scalar @{ $self->{processes} };
}
sub DESTROY {
my $self = shift;
$self->clear;
}
sub _escape {
my $string = shift;
$string =~ s/&/&/g;
$string =~ s/</</g;
$string =~ s/>/>/g;
$string =~ s/\"/"/g;
$string =~ s/\'/'/g;
return $string;
}
sub _unescape {
my $string = shift;
$string =~ s/&/&/g;
$string =~ s/</</g;
$string =~ s/>/>/g;
$string =~ s/"/\"/g;
$string =~ s/'/\'/g;
$string =~ s/'/\'/g;
return $string;
}
sub _unescape_entities {
my $string = shift;
$string =~ s/$ENTITY_REGEXP/\&$1$2/g;
return $string;
}
# ============================== override method start ============================== #
sub declaration {
my ( $self, $declaration ) = @_;
$declaration = "<!$declaration>";
$self->output( ( $self->{allow_declaration} )
? $declaration
: &_escape($declaration) );
}
sub process {
my ( $self, $process, $process_text ) = @_;
if ( $self->{collection_process} ) {
my $tmp_process = $process;
$tmp_process =~ s/\?$//;
push @{ $self->{processes} }, $tmp_process;
}
$self->SUPER::process( $process,
( $self->{allow_process} ) ? $process_text : &_escape($process_text) );
}
sub start {
my ( $self, $tagname, $attr, $attrseq, $text ) = @_;
$self->{_current_tag} = lc $tagname;
if ( $self->is_allow_tags($tagname) ) {
if ( !$self->allow_script ) {
## change javascript event handler(1 : allow) e.g <body onload="alert(1)"> => <body onload="void(0)">
foreach ( keys %{$attr} ) {
my $event = lc $_;
if ( exists $JS_EVENT{$event} && !$JS_EVENT{$event} ) {
#delete $attr->{$event};
$attr->{$event} = "void(0)";
}
}
## change javascript <a href="javascript:evil_script('evil')"> => <a href="javascript:void(0)">
if ( !$JS_EVENT{cite} && $attr->{href} =~ /^(java|vb)script:/i ) {
$attr->{href} = "javascript:void(0)";
}
## tag is generated again
my $element = HTML::Element->new( $tagname, %{$attr} );
$text = $element->starttag;
$element->delete;
$element = undef;
}
}
else {
$text = &_escape($text);
}
$self->SUPER::start( $tagname, $attr, $attrseq, $text );
}
sub end {
my ( $self, $tagname, $text ) = @_;
$self->{_current_tag} = undef;
$text = &_escape($text) if !$self->is_allow_tags($tagname);
$self->SUPER::end( $tagname, $text );
}
sub comment {
my ( $self, $comment ) = @_;
$comment = "<!--$comment-->";
$self->output( ( $self->{allow_comment} ) ? $comment : &_escape($comment) );
}
sub text {
my ( $self, $text, $is_cdata ) = @_;
$text = &_escape($text);
$text = &_unescape_entities($text) if $self->{allow_entity_reference};
$text = &_unescape($text)
if $is_cdata
&& $self->{_current_tag} eq "script"
&& $self->{allow_script};
$text = &_unescape($text)
if $is_cdata && $self->{_current_tag} eq "style" && $self->{allow_style};
$self->SUPER::text( $text, $is_cdata );
}
sub output {
my ( $self, $content ) = @_;
push @{ $self->{_content} }, $content;
}
1;
__END__
=head1 NAME
HTML::EscapeEvil - Escape tag
=head1 VERSION
0.05
=head1 SYNPSIS
use HTML::EscapeEvil;
my $escapeevil = HTML::EscapeEvil->new;
my $evil_html = <<HTML;
<script type="text/javascript">
<!--
alert("script is evil tags!!");
//-->
</script>
<iflame src="deny.html" width="100" height="100"></iframe>
HTML
$escapeevil->parse($html); #from string
$escapeevil->parse_file($html_file); #from file or file handle
my $clean_html = $escapeevil->filtered_html;
$escapeevil->clear;
=head1 DESCRIPTION
The tag that doesn't want to permit escapes all.
=head1 METHOD
=head2 new
create instance
Example :
my $escapeevil = HTML::EscapeEvil->new(
allow_comment => 1,
allow_declaration => 0,
allow_process => 0,
allow_tags => [qw(a l l o w t a g s)],
#allow_tags => "one",# OK
);
Option :
allow_comment : allow comment. default 0.
allow_declaration : allow_declaration. default 0.
allow_process : allow_process. default 0.
allow_tags : set allow tags
allow_script : allow script tag. default 0(is_allow_tags("script") OK)
allow_style : allow style tag. default 0(is_allow_tags("style") OK)
allow_entity_reference : allow entity reference. default 1
collection_process : collection process. default 0
When tag is not specified for allow_tags, default makes all tag invalid.
=head2 set_allow_tags
The setting is returned to default.
Example :
$escapeevil->set_allow_tags(qw(t a g s));
=head2 add_allow_tags
The tag that wants to permit is added.
Example :
$escapeevil->add_allow_tags(qw(t a g s));
=head2 deny_tags
The specified tag is not permitted.
Example :
$escapeevil->deny_tags(qw(t a g s));
=head2 get_allow_tags
The list of the tag that has been permitted is returned.
Example :
my @list = $escapeevil->get_allow_tags;
=head2 is_allow_tags
Whether it is tag that has been permitted is checked.
Example :
print 'script is ', ($escapeevil->is_allow_tags('script')) ? 'allowed' : 'not allowed';
=head2 deny_all
No permission of all
Example :
$escapeevil->deny_all;
=head2 allow_comment
Whether the comment has been permitted is checked. Or, the setting change of the comment permission.
Example :
print 'comment is ', ($escapeevil->allow_comment) ? 'allowed' : 'not allowed';
$escapeevil->allow_comment(1); ## allow comment!
=head2 allow_declaration
Whether the DOCTYPE declaration has been permitted is checked. Or, the setting change of the DOCTYPE declaration permission.
Example :
print 'declaration is ', ($escapeevil->allow_declaration) ? 'allowed' : 'not allowed';
$escapeevil->allow_declaration(1); ## allow declaration!
=head2 allow_process
Whether the processing instruction has been permitted is checked. Or, the setting change of the processing instruction.
Example :
print 'process is ', ($escapeevil->allow_process) ? 'allowed' : 'not allowed';
$escapeevil->allow_process(1); ## allow process!
=head2 allow_entity_reference
Whether the substance reference has been permitted is checked. Or, the setting change of the substance reference.
Example :
print 'entity_reference is ', ($escapeevil->allow_entity_reference) ? 'allowed' : 'not allowed';
$escapeevil->allow_entity_reference(1); ## allow entity_reference!
=head2 allow_script
Whether it permits is checked script tag. Or, the setting change of script tag.
Example :
print 'script is ', ($escapeevil->allow_script) ? 'allowed' : 'not allowed';
$escapeevil->allow_script(1); ## allow script!
=head2 allow_style
Whether it permits is checked style tag. Or, the setting change of style tag.
Example :
print 'style is ', ($escapeevil->allow_style) ? 'allowed' : 'not allowed';
$escapeevil->allow_style(1); ## allow style!
=head2 collection_process
The setting change whether to collect process is done. Or, a present setting is acquired.
Example :
print 'collection_process is ', ($escapeevil->collection_process) ? 'collection' : 'no collection';
$escapeevil->collection_process(1); ##colloction process!
=head2 processes
The reference of the array of the processing instruction list is acquired. (reading exclusive use)
Example :
foreach(@{$escapeevil->processes}){
my $process = $_;
#example: eval $process ,system $process etc..
}
=head2 filtered_html
HTML that escapes in the tag not permitted is returned.
Example :
print $escapeevil->filetered_html;
=head2 filtered_file
HTML that escapes in the tag not permitted is written file.
Example :
(e.g.1)
$escapeevil->filtered_file("./filtered_file.html");
(e.g.2)
$escapeevil->filtered_file(*FILEHANDLE);
=head2 filtered
version 0.02 new method. parse(parse_file) and filtered_html(filtered_file) and eof,clear_process do.
Example :
my $html = "<script type=\"text/javascript\"><!--alert(\"hello!\");//--></script>";
(e.g.1)
my $cleanhtml = $escapeevil->filtered($html);
(e.g.2)
$escapeevil->filtered($html,"writefile.html");
(e.g.3)
open FILEHANDLE,"< evil.html" or die $!;
$escapeevil->filtered(*FILEHANDLE,"writefile.html");
=head2 clear_process
Collected process is annulled.
Example :
$escapeevil->clear_process;
=head2 clear
Initialization of variable that liberates of HTML::Parser object and is internal. Please execute it when processing is completed.
Example :
$escapeevil->clear;
=head1 NEW OPTION
VERSION 0.03.Javascript of event handler becomes invalid at allow_script(0) though event handler of javascript is defined in the tag that has been permitted, too.
Example :
<a href="javascript:alert(1234)">hello</a> => <a href="javascript:void(0)">hello</a>
<body onload="alert(5678)"> => <body onload="void(0)">
The definition of event handler is described in %HTML::Escape::JS_EVENT.
=head1 CAUTION
Please filtered_file must specify passing the file and specify the correct one. Die is executed when there are neither passing nor a writing authority that cannot be.
Processes is a method only for reading. When the value is set, die is done.
Carp http://search.cpan.org/~nwclark/perl-5.8.8/lib/Carp.pm
Class::Accessor http://search.cpan.org/~kasei/Class-Accessor-0.22/lib/Class/Accessor.pm
HTML::Element http://search.cpan.org/~petdance/HTML-Tree-3.1901/lib/HTML/Element.pm
HTML::Filter http://search.cpan.org/~gaas/HTML-Parser-3.46/lib/HTML/Filter.pm
HTML::Parser http://search.cpan.org/~gaas/HTML-Parser-3.46/Parser.pm
=head1 SEE ALSO
L<Carp> L<Class::Accessor> L<HTML::Element> L<HTML::Filter> L<HTML::Parser>
=head1 AUTHOR
Akira Horimoto <kurt0027@gmail.com>
=head1 COPYRIGHT
Copyright (C) 2006 Akira Horimoto
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
=cut