The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#############################################################################
## Name:        HTMLParser.pm
## Purpose:     XML::Smart::HTMLParser
## Author:      Graciliano M. P.
## Modified by: Harish Madabushi
## Created:     29/05/2003
## RCS-ID:      
## Copyright:   (c) 2003 Graciliano M. P.
## Licence:     This program is free software; you can redistribute it and/or
##              modify it under the same terms as Perl itself
#############################################################################

package XML::Smart::HTMLParser                                 ;

use 5.006                                                      ;

use strict                                                     ;
use warnings                                                   ;

use XML::Smart::Shared qw( _unset_sig_warn _reset_sig_warn )   ;

our ($VERSION , @ISA) ;
$VERSION = '1.12' ;

#######
# NEW #
#######

sub new { 
  my $this = shift ;
  my $class = ref($this) || $this ;
  return $this if ref $this ;

  $this = bless {} => $class ;
  
  _unset_sig_warn() ;
  my %args = @_ ;
  _reset_sig_warn() ;
  $this->setHandlers(%args) ;
  
  $this->{NOENTITY} = 1 ;
  
  return $this ;
}

###############
# SETHANDLERS #
###############

sub setHandlers {
  my $this = shift ;
  _unset_sig_warn() ;
  my %args = @_;
  _reset_sig_warn() ;
    
  $this->{Init}  = $args{Init} || sub{} ;
  $this->{Start} = $args{Start} || sub{} ;
  $this->{Char}  = $args{Char} || sub{} ;
  $this->{End}   = $args{End} || sub{} ;
  $this->{Final} = $args{Final} || sub{} ;
  
  return( 1 ) ;

}

#########
# PARSE #
#########

sub parse {

    my $this = shift ;
    my $data = shift ;
    
    $data =~ s/\r\n?/\n/gs ;
    
    $data =~ s/^\s*<\?xml.*?>//gsi ;
    
    my @parsed ;
    
    while( $data =~ /(.*?)<(.*?)>/gsi ) {

	my $cont = $1 ;
	my $markup = $2 ;
	
	my ( $more_q , @args ) = &parse_tag( $markup ) ;
	
	while( $more_q ) {
	    my $more ;
	    ( $more ) = ( $data =~ /\G(.*?)>/s ) ;
	    pos( $data ) += length( $more ) + 1 ;
	    $markup = $markup.'>'.$more ;
	    ( $more_q , @args ) = &parse_tag( $markup ) ;
	}
	
	if( $cont =~ /\S/s ) { 
	    push( @parsed , 'Char' , $cont ) ;
	}
	
	if( $args[0] =~ /^\/(.*)/ ) { 
	    push( @parsed , 'End'      , $1      ) ;
	} elsif( $args[-1] =~ /^\/$/ ) {
	    pop @args ;
	    push( @parsed , 'StartEnd' , [@args] ) ;
	} else { 
	    push( @parsed , 'Start'    , [@args] ) ;
	}
    }
    
    {
	
	my ( %close, @close, %open ) ;

	for( my $i = ( $#parsed-1 ); $i >= 0; $i-=2 ) {

	    my $type = $parsed[$i] ;
	    
	    if( $type eq 'End' ) {
		my $tag = $parsed[ $i+1 ] ;
		$close{ lc( $tag ) }++    ;
		push( @close , $i )       ;
	    } elsif ( $type eq 'Start' ) {

		my $tag = @{ $parsed[$i+1] }[0] ;
		
		if( !$close{lc($tag)} ) {
		    if( @{$parsed[$i+1]}[-1] eq '/' && $#{$parsed[$i+1]} % 2 ) {
			pop @{$parsed[$i+1]}     ;
			$parsed[$i] = 'StartEnd' ;
		    } elsif( $parsed[$i+2] ne 'Char') { 
			$parsed[$i] = 'StartEnd' ;
		    } else {
			push( @{ $open{$close[-1]} }  , 'End' , $tag ) ;
		    }
		} else {
		    $close{lc($tag)}-- ;
		    pop(@close)        ;
		}
	    }
	}
	
	if ( %open ) {
	    my @parsed2 ;
	    for( my $i=0 ; $i <= $#parsed ; ++$i ) {
		push( @parsed2, @{$open{ $i } } ) if $open{ $i } ;
		push( @parsed2, $parsed[ $i ]   )                ;
	    }
	    @parsed = @parsed2 ;
	}
	
    }
    
    &{$this->{Init}}($this) ;
    
    for( my $i = 0 ; $i <= $#parsed ; $i+=2 ) {
	my $type = $parsed[ $i   ] ;
	my $args = $parsed[ $i+1 ] ;
	
	if    ($type eq 'Start'   ) { &{$this->{Start}}( $this , ref($args) ? @{$args} : $args )  ;}
	elsif ($type eq 'Char'    ) { &{$this->{Char}}(  $this , ref($args) ? @{$args} : $args )  ;}
	elsif ($type eq 'End'     ) { &{$this->{End}}(   $this , ref($args) ? @{$args} : $args )  ;}
	elsif ($type eq 'StartEnd') {
	    &{$this->{Start}}( $this , ref($args) ? @{$args} : $args ) ;
	    &{$this->{End}}( $this , ref($args) ? @{$args}[0] : $args ) ;
	}
    }
    
    return &{$this->{Final}}($this) ;

}

#############
# PARSE_TAG #
#############

sub parse_tag {
  my $args = shift ;
  
  #print "[$args]\n" ;
  
  if ($args =~ /^!--/s) {
    if ($args !~ /--$/s) { return('--') ;}
    
    $args =~ s/^!--//s ;
    $args =~ s/--$//s ;
    
    return('' , '!--' , 'CONTENT' , $args ) ;
  }
  
  
  my @args ;
  my ($type , $type_last) = (-1,-1) ;
  
  while($args =~ /(?:^\s*)?(?:
    (
     \w+:\/\/[^'"\s]+  ## URI without quotes
     |
     [\w:\.-]+    ## words
    )

  |
  ([^'"=\s]+)    ## unquoted values
  |
  (=) ## equal between name and value
  |
    ## Quote: '...'
    ('
      (?:
        '
        |
        (?:(?:\\')?[^'])+(?:'{1,2}|.*)
      )
    )
  
  |
    ## Quote: "..."
    ("
      (?:
        "
        |
        (?:(?:\\")?[^"])+(?:"{1,2}|.*)
      )
    )

  )/gsx) {
    my $got ;
    _unset_sig_warn() ;
    if    ($1 ne '') { $got = $1 ;}
    elsif ($2 ne '') { $got = $2 ;}
    elsif ($3 ne '') { $got = $3 ;}
    elsif ($4 ne '') { $got = $4 ;}
    elsif ($5 ne '') { $got = $5 ;}
    else { 
	_reset_sig_warn() ;
	next ;
    }
    _reset_sig_warn() ;
    
    if ($got =~ /^(['"])/s) {
      my $q = $1 ;
      if ($got !~ /$q$/s || $got =~ /\\$q$/s) { return($q) ;}
      else { $got =~ s/^$q//s ; $got =~ s/$q$//s ;}
    }
    if ($got eq '=') { $type = 1 ;}
    else {
      if ($type_last == 0 && $type == 0) { push(@args , '') ;}
      push(@args , $got) ;
      $type_last = $type ;
      $type = 0 ;
    }
  }
  
  #print "@args\n" ;
  
  return( '' , @args ) ;
}

#######
# END #
#######

1;