#############################################################################
## Name: HTMLParser.pm
## Purpose: XML::Smart::HTMLParser
## Author: Graciliano M. P.
## Modified by: Harish Madabushi
## Created: 29/05/2003
## RCS-ID:
## Copyright: (c) 2003 Graciliano M. P.
## Licence: This program is free software; you can redistribute it and/or
## modify it under the same terms as Perl itself
#############################################################################
package XML::Smart::HTMLParser ;
use 5.006 ;
use strict ;
use warnings ;
use XML::Smart::Shared qw( _unset_sig_warn _reset_sig_warn ) ;
our ($VERSION , @ISA) ;
$VERSION = '1.12' ;
#######
# NEW #
#######
sub new {
my $this = shift ;
my $class = ref($this) || $this ;
return $this if ref $this ;
$this = bless {} => $class ;
_unset_sig_warn() ;
my %args = @_ ;
_reset_sig_warn() ;
$this->setHandlers(%args) ;
$this->{NOENTITY} = 1 ;
return $this ;
}
###############
# SETHANDLERS #
###############
sub setHandlers {
my $this = shift ;
_unset_sig_warn() ;
my %args = @_;
_reset_sig_warn() ;
$this->{Init} = $args{Init} || sub{} ;
$this->{Start} = $args{Start} || sub{} ;
$this->{Char} = $args{Char} || sub{} ;
$this->{End} = $args{End} || sub{} ;
$this->{Final} = $args{Final} || sub{} ;
return( 1 ) ;
}
#########
# PARSE #
#########
sub parse {
my $this = shift ;
my $data = shift ;
$data =~ s/\r\n?/\n/gs ;
$data =~ s/^\s*<\?xml.*?>//gsi ;
my @parsed ;
while( $data =~ /(.*?)<(.*?)>/gsi ) {
my $cont = $1 ;
my $markup = $2 ;
my ( $more_q , @args ) = &parse_tag( $markup ) ;
while( $more_q ) {
my $more ;
( $more ) = ( $data =~ /\G(.*?)>/s ) ;
pos( $data ) += length( $more ) + 1 ;
$markup = $markup.'>'.$more ;
( $more_q , @args ) = &parse_tag( $markup ) ;
}
if( $cont =~ /\S/s ) {
push( @parsed , 'Char' , $cont ) ;
}
if( $args[0] =~ /^\/(.*)/ ) {
push( @parsed , 'End' , $1 ) ;
} elsif( $args[-1] =~ /^\/$/ ) {
pop @args ;
push( @parsed , 'StartEnd' , [@args] ) ;
} else {
push( @parsed , 'Start' , [@args] ) ;
}
}
{
my ( %close, @close, %open ) ;
for( my $i = ( $#parsed-1 ); $i >= 0; $i-=2 ) {
my $type = $parsed[$i] ;
if( $type eq 'End' ) {
my $tag = $parsed[ $i+1 ] ;
$close{ lc( $tag ) }++ ;
push( @close , $i ) ;
} elsif ( $type eq 'Start' ) {
my $tag = @{ $parsed[$i+1] }[0] ;
if( !$close{lc($tag)} ) {
if( @{$parsed[$i+1]}[-1] eq '/' && $#{$parsed[$i+1]} % 2 ) {
pop @{$parsed[$i+1]} ;
$parsed[$i] = 'StartEnd' ;
} elsif( $parsed[$i+2] ne 'Char') {
$parsed[$i] = 'StartEnd' ;
} else {
push( @{ $open{$close[-1]} } , 'End' , $tag ) ;
}
} else {
$close{lc($tag)}-- ;
pop(@close) ;
}
}
}
if ( %open ) {
my @parsed2 ;
for( my $i=0 ; $i <= $#parsed ; ++$i ) {
push( @parsed2, @{$open{ $i } } ) if $open{ $i } ;
push( @parsed2, $parsed[ $i ] ) ;
}
@parsed = @parsed2 ;
}
}
&{$this->{Init}}($this) ;
for( my $i = 0 ; $i <= $#parsed ; $i+=2 ) {
my $type = $parsed[ $i ] ;
my $args = $parsed[ $i+1 ] ;
if ($type eq 'Start' ) { &{$this->{Start}}( $this , ref($args) ? @{$args} : $args ) ;}
elsif ($type eq 'Char' ) { &{$this->{Char}}( $this , ref($args) ? @{$args} : $args ) ;}
elsif ($type eq 'End' ) { &{$this->{End}}( $this , ref($args) ? @{$args} : $args ) ;}
elsif ($type eq 'StartEnd') {
&{$this->{Start}}( $this , ref($args) ? @{$args} : $args ) ;
&{$this->{End}}( $this , ref($args) ? @{$args}[0] : $args ) ;
}
}
return &{$this->{Final}}($this) ;
}
#############
# PARSE_TAG #
#############
sub parse_tag {
my $args = shift ;
#print "[$args]\n" ;
if ($args =~ /^!--/s) {
if ($args !~ /--$/s) { return('--') ;}
$args =~ s/^!--//s ;
$args =~ s/--$//s ;
return('' , '!--' , 'CONTENT' , $args ) ;
}
my @args ;
my ($type , $type_last) = (-1,-1) ;
while($args =~ /(?:^\s*)?(?:
(
\w+:\/\/[^'"\s]+ ## URI without quotes
|
[\w:\.-]+ ## words
)
|
([^'"=\s]+) ## unquoted values
|
(=) ## equal between name and value
|
## Quote: '...'
('
(?:
'
|
(?:(?:\\')?[^'])+(?:'{1,2}|.*)
)
)
|
## Quote: "..."
("
(?:
"
|
(?:(?:\\")?[^"])+(?:"{1,2}|.*)
)
)
)/gsx) {
my $got ;
_unset_sig_warn() ;
if ($1 ne '') { $got = $1 ;}
elsif ($2 ne '') { $got = $2 ;}
elsif ($3 ne '') { $got = $3 ;}
elsif ($4 ne '') { $got = $4 ;}
elsif ($5 ne '') { $got = $5 ;}
else {
_reset_sig_warn() ;
next ;
}
_reset_sig_warn() ;
if ($got =~ /^(['"])/s) {
my $q = $1 ;
if ($got !~ /$q$/s || $got =~ /\\$q$/s) { return($q) ;}
else { $got =~ s/^$q//s ; $got =~ s/$q$//s ;}
}
if ($got eq '=') { $type = 1 ;}
else {
if ($type_last == 0 && $type == 0) { push(@args , '') ;}
push(@args , $got) ;
$type_last = $type ;
$type = 0 ;
}
}
#print "@args\n" ;
return( '' , @args ) ;
}
#######
# END #
#######
1;