package HTML::Packer;
use 5.008009;
use strict;
use warnings;
use Carp;
use Regexp::RegGrp;
# -----------------------------------------------------------------------------
our $VERSION = '2.00';
our @BOOLEAN_ACCESSORS = (
'remove_comments',
'remove_newlines',
'no_compress_comment',
'html5',
);
our @JAVASCRIPT_OPTS = ( 'clean', 'obfuscate', 'shrink', 'best' );
our @CSS_OPTS = ( 'minify', 'pretty' );
our $REQUIRED_JAVASCRIPT_PACKER = '1.002001';
our $REQUIRED_CSS_PACKER = '1.000001';
our @SAVE_SPACE_ELEMENTS = (
'a', 'abbr', 'acronym', 'address', 'b', 'bdo', 'big', 'button', 'cite',
'del', 'dfn', 'em', 'font', 'i', 'input', 'ins', 'kbd', 'label', 'q',
's', 'samp', 'select', 'small', 'strike', 'strong', 'sub', 'sup', 'u', 'var'
);
our @VOID_ELEMENTS = (
'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
);
# Some regular expressions are from HTML::Clean
our $COMMENT = '((?>\s*))(<!--(?:(?![#\[]| google_ad_section_).*?)?-->)((?>\s*))';
our $PACKER_COMMENT = '<!--\s*HTML::Packer\s*(\w+)\s*-->';
our $DOCTYPE = '<\!DOCTYPE[^>]*>';
our $DONT_CLEAN = '(<\s*(pre|code|textarea|script|style)[^>]*>)(.*?)(<\s*\/\2[^>]*>)';
our $WHITESPACES = [
{
regexp => qr/^\s*/s,
replacement => ''
},
{
regexp => qr/\s*$/s,
replacement => ''
},
{
regexp => '^\s*',
replacement => '',
modifier => 'm'
},
{
regexp => '[^\S\n]*$',
replacement => '',
modifier => 'm'
},
{
regexp => qr/(?<=>)[^<>]*(?=<)/sm,
replacement => sub {
my $match = $_[0]->{match};
$match =~ s/[^\S\n]{2,}/ /sg;
$match =~ s/\s*\n+\s*/\n/sg;
return $match;
}
},
{
regexp => '<\s*(\/)?\s*',
replacement => sub {
return sprintf( '<%s', $_[0]->{submatches}->[0] );
},
modifier => 's'
},
{
regexp => '\s*(\/)?\s*>',
replacement => sub {
return sprintf( '%s>', $_[0]->{submatches}->[0] );
},
modifier => 's'
}
];
our $NEWLINES_TAGS = [
{
regexp => '(\s*)(<\s*\/?\s*(?:' . join( '|', @SAVE_SPACE_ELEMENTS ) . ')\b[^>]*>)(\s*)',
replacement => sub {
return sprintf( '%s%s%s', $_[0]->{submatches}->[0] ? ' ' : '', $_[0]->{submatches}->[1], $_[0]->{submatches}->[2] ? ' ' : '' );
},
modifier => 'is'
}
];
our $NEWLINES = [
{
regexp => '(.)\n(.)',
replacement => sub {
my ( $pre, $post ) = @{$_[0]->{submatches}};
my $ret;
if ( $pre eq '>' or $post eq '<' ) {
$ret = $pre . $post;
}
elsif ( $pre =~ /[\w-]/ and $post =~ /[\w-]/ ) {
$ret = $pre . ' ' . $post;
}
else {
$ret = $pre . $post;
}
return $ret;
}
}
];
our @REGGRPS = ( 'newlines', 'newlines_tags', 'whitespaces', 'void_elements' );
our $GLOBAL_REGGRP = 'global';
##########################################################################################
{
no strict 'refs';
foreach my $field ( @BOOLEAN_ACCESSORS ) {
next if defined *{ __PACKAGE__ . '::' . $field }{CODE};
*{ __PACKAGE__ . '::' . $field} = sub {
my ( $self, $value ) = @_;
$self->{'_' . $field} = $value ? 1 : undef if ( defined( $value ) );
return $self->{'_' . $field};
};
}
foreach my $reggrp ( @REGGRPS, $GLOBAL_REGGRP ) {
next if defined *{ __PACKAGE__ . '::reggrp_' . $reggrp }{CODE};
*{ __PACKAGE__ . '::reggrp_' . $reggrp } = sub {
my ( $self ) = shift;
return $self->{ '_reggrp_' . $reggrp };
};
}
}
sub do_javascript {
my ( $self, $value ) = @_;
if ( defined( $value ) ) {
if ( grep( $value eq $_, @JAVASCRIPT_OPTS ) ) {
$self->{_do_javascript} = $value;
}
elsif ( ! $value ) {
$self->{_do_javascript} = undef;
}
}
return $self->{_do_javascript};
}
sub do_stylesheet {
my ( $self, $value ) = @_;
if ( defined( $value ) ) {
if ( grep( $value eq $_, @CSS_OPTS ) ) {
$self->{_do_stylesheet} = $value;
}
elsif ( ! $value ) {
$self->{_do_stylesheet} = undef;
}
}
return $self->{_do_stylesheet};
}
sub init {
my $class = shift;
my $self = {};
bless( $self, $class );
$self->{whitespaces}->{reggrp_data} = $WHITESPACES;
$self->{newlines}->{reggrp_data} = $NEWLINES;
$self->{newlines_tags}->{reggrp_data} = $NEWLINES_TAGS;
$self->{global}->{reggrp_data} = [
{
regexp => $DOCTYPE,
replacement => sub {
return '<!--~' . $_[0]->{store_index} . '~-->';
},
store => sub {
my $doctype = $_[0]->{match};
$doctype =~ s/\s+/ /gsm;
return $doctype;
}
},
{
regexp => $COMMENT,
replacement => sub {
return $self->remove_comments() ? (
$self->remove_newlines() ? ' ' : (
( $_[0]->{submatches}->[0] =~ /\n/s or $_[0]->{submatches}->[2] =~ /\n/s ) ? "\n" : ''
)
) : '<!--~' . $_[0]->{store_index} . '~-->';
},
store => sub {
my $ret = $self->remove_comments() ? '' : (
( ( not $self->remove_newlines() and $_[0]->{submatches}->[0] =~ /\n/s ) ? "\n" : '' ) .
$_[0]->{submatches}->[1] .
( ( not $self->remove_newlines() and $_[0]->{submatches}->[2] =~ /\n/s ) ? "\n" : '' )
);
return $ret;
}
},
{
regexp => $DONT_CLEAN,
replacement => sub {
return '<!--~' . $_[0]->{store_index} . '~-->';
},
store => sub {
my ( $opening, undef, $content, $closing ) = @{$_[0]->{submatches}};
if ( $content ) {
my $opening_script_re = '<\s*script' . ( $self->html5() ? '[^>]*>' : '[^>]*(?:java|ecma)script[^>]*>' );
my $opening_style_re = '<\s*style' . ( $self->html5() ? '[^>]*>' : '[^>]*text\/css[^>]*>' );
if ( $opening =~ /$opening_script_re/i ) {
$opening =~ s/ type="(text\/)?(java|ecma)script"//i if ( $self->html5() );
if ( $self->javascript_packer() and $self->do_javascript() ) {
$self->javascript_packer()->minify( \$content, { compress => $self->do_javascript() } );
unless ( $self->html5() ) {
$content = '/*<![CDATA[*/' . $content . '/*]]>*/';
}
}
}
elsif ( $opening =~ /$opening_style_re/i ) {
$opening =~ s/ type="text\/css"//i if ( $self->html5() );
if ( $self->css_packer() and $self->do_stylesheet() ) {
$self->css_packer()->minify( \$content, { compress => $self->do_stylesheet() } );
$content = "\n" . $content if ( $self->do_stylesheet() eq 'pretty' );
}
}
}
else {
$content = '';
}
$self->reggrp_whitespaces()->exec( \$opening );
$self->reggrp_whitespaces()->exec( \$closing );
return $opening . $content . $closing;
},
modifier => 'ism'
}
];
$self->{void_elements}->{reggrp_data} = [
{
regexp => '<\s*((?:' . join( '|', @VOID_ELEMENTS ) . ')\b[^>]*)\s*\/>',
replacement => sub {
return '<' . $_[0]->{submatches}->[0] . '>';
},
modifier => 'ism'
}
];
foreach ( @HTML::Packer::REGGRPS ) {
$self->{ '_reggrp_' . $_ } = Regexp::RegGrp->new( { reggrp => $self->{$_}->{reggrp_data} } );
}
$self->{ '_reggrp_' . $GLOBAL_REGGRP } = Regexp::RegGrp->new(
{
reggrp => $self->{$GLOBAL_REGGRP}->{reggrp_data},
restore_pattern => qr/<!--~(\d+)~-->/
}
);
return $self;
}
sub minify {
my ( $self, $input, $opts );
unless (
ref( $_[0] ) and
ref( $_[0] ) eq __PACKAGE__
) {
$self = __PACKAGE__->init();
shift( @_ ) unless ( ref( $_[0] ) );
( $input, $opts ) = @_;
}
else {
( $self, $input, $opts ) = @_;
}
if ( ref( $input ) ne 'SCALAR' ) {
carp( 'First argument must be a scalarref!' );
return undef;
}
my $html;
my $cont = 'void';
if ( defined( wantarray ) ) {
my $tmp_input = ref( $input ) ? ${$input} : $input;
$html = \$tmp_input;
$cont = 'scalar';
}
else {
$html = ref( $input ) ? $input : \$input;
}
if ( ref( $opts ) eq 'HASH' ) {
foreach my $field ( @BOOLEAN_ACCESSORS ) {
$self->$field( $opts->{$field} ) if ( defined( $opts->{$field} ) );
}
$self->do_javascript( $opts->{do_javascript} ) if ( defined( $opts->{do_javascript} ) );
$self->do_stylesheet( $opts->{do_stylesheet} ) if ( defined( $opts->{do_stylesheet} ) );
}
if ( not $self->no_compress_comment() and ${$html} =~ /$PACKER_COMMENT/s ) {
my $compress = $1;
if ( $compress eq '_no_compress_' ) {
return ( $cont eq 'scalar' ) ? ${$html} : undef;
}
}
$self->reggrp_global()->exec( $html );
$self->reggrp_whitespaces()->exec( $html );
if ( $self->remove_newlines() ) {
$self->reggrp_newlines_tags()->exec( $html );
$self->reggrp_newlines()->exec( $html );
}
if ( $self->html5() ) {
$self->reggrp_void_elements()->exec( $html );
}
$self->reggrp_global()->restore_stored( $html );
return ${$html} if ( $cont eq 'scalar' );
}
sub javascript_packer {
my $self = shift;
unless ( $self->{_checked_javascript_packer} ) {
eval "use JavaScript::Packer $REQUIRED_JAVASCRIPT_PACKER;";
unless ( $@ ) {
$self->{_javascript_packer} = eval {
JavaScript::Packer->init();
};
}
$self->{_checked_javascript_packer} = 1;
}
return $self->{_javascript_packer};
}
sub css_packer {
my $self = shift;
unless ( $self->{_checked_css_packer} ) {
eval "use CSS::Packer $REQUIRED_CSS_PACKER;";
unless ( $@ ) {
$self->{_css_packer} = eval {
CSS::Packer->init();
};
}
$self->{_checked_css_packer} = 1;
}
return $self->{_css_packer};
}
1;
__END__
=head1 NAME
HTML::Packer - Another HTML code cleaner
=for html
<a href='https://travis-ci.org/leejo/html-packer-perl?branch=master'><img src='https://travis-ci.org/leejo/html-packer-perl.svg?branch=master' alt='Build Status' /></a>
<a href='https://coveralls.io/r/leejo/html-packer-perl'><img src='https://coveralls.io/repos/leejo/html-packer-perl/badge.png?branch=master' alt='Coverage Status' /></a>
=head1 VERSION
Version 2.00
=head1 DESCRIPTION
A HTML Compressor.
=head1 SYNOPSIS
use HTML::Packer;
my $packer = HTML::Packer->init();
$packer->minify( $scalarref, $opts );
To return a scalar without changing the input simply use (e.g. example 2):
my $ret = $packer->minify( $scalarref, $opts );
For backward compatibility it is still possible to call 'minify' as a function:
HTML::Packer::minify( $scalarref, $opts );
First argument must be a scalarref of HTML-Code.
Second argument must be a hashref of options. Possible options are
=over 4
=item remove_comments
HTML-Comments will be removed if 'remove_comments' has a true value.
=item remove_newlines
ALL newlines will be removed if 'remove_newlines' has a true value.
=item do_javascript
Defines compression level for javascript. Possible values are 'clean', 'obfuscate', 'shrink' and 'best'.
Default is no compression for javascript.
This option only takes effect if L<JavaScript::Packer> is installed.
=item do_stylesheet
Defines compression level for CSS. Possible values are 'minify' and 'pretty'.
Default is no compression for CSS.
This option only takes effect if L<CSS::Packer> is installed.
=item no_compress_comment
If not set to a true value it is allowed to set a HTML comment that prevents the input being packed.
<!-- HTML::Packer _no_compress_ -->
Is not set by default.
=item html5
If set to a true value closing slashes will be removed from void elements.
=back
=head1 AUTHOR
Merten Falk, C<< <nevesenin at cpan.org> >>. Now maintained by Lee
Johnson (LEEJO).
=head1 BUGS
Please report any bugs or feature requests through
the web interface at L<https://github.com/leejo/html-packer-perl/issues>. I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc HTML::Packer
=head1 COPYRIGHT & LICENSE
Copyright 2009 - 2011 Merten Falk, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=head1 SEE ALSO
L<HTML::Clean>
=cut