The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#  $Id: TableExtractor.pm,v 1.2 2002/06/11 15:52:25 simon Exp $


package HTML::TableExtractor;

# Strictures come first so that every statement in the package, including
# the inheritance set-up below, is compiled under them.
use strict;
use warnings;

use HTML::Parser;

# Inherit the parsing engine from HTML::Parser. Declared with "our" so the
# assignment is legal under "use strict".
our @ISA = qw(HTML::Parser);

our $VERSION = 0.11;

# The tags we're interested in. start() and end() ignore every other tag,
# and parse() only looks for callbacks named after these tags.
my @tag_names = qw(table tr td th);



# start($self, $tag, $attr, $attrseq, $origtext)
#
# Start-tag handler. For a tag listed in @tag_names it runs, in this order,
# the "<tag>_start_callback" and then the "<tag>_callback" stored on the
# object, passing each ($attr, $origtext). A slot that does not hold a code
# reference is skipped, and tags outside @tag_names are ignored. $attrseq is
# accepted but not forwarded to the callbacks.
#
# Returns nothing useful.
sub start
{
	my ($self, $tag, $attr, $attrseq, $origtext) = @_;

	# Normalise once so that the filter and the callback lookup agree. The
	# callbacks are stored under lower-case tag names, so a tag reported in
	# upper or mixed case (e.g. when the parser is running case-sensitively)
	# must still find them.
	my $name = lc $tag;

	return unless grep { $_ eq $name } @tag_names;

	# The start-specific callback fires first, then the general tag callback.
	for my $slot ("${name}_start_callback", "${name}_callback") {
		my $callback = $self->{$slot};
		$callback->($attr, $origtext) if ref($callback) eq 'CODE';
	}

	return;
}




# end($self, $tag, $origtext)
#
# End-tag handler. For a tag listed in @tag_names it runs, in this order,
# the "<tag>_callback" and then the "<tag>_end_callback" stored on the
# object, passing each ($origtext) only. A slot that does not hold a code
# reference is skipped, and tags outside @tag_names are ignored.
#
# Returns nothing useful.
sub end
{
	my ($self, $tag, $origtext) = @_;

	# Normalise once so that the filter and the callback lookup agree. The
	# callbacks are stored under lower-case tag names, so a tag reported in
	# upper or mixed case (e.g. when the parser is running case-sensitively)
	# must still find them.
	my $name = lc $tag;

	return unless grep { $_ eq $name } @tag_names;

	# The general tag callback fires first, then the end-specific callback.
	for my $slot ("${name}_callback", "${name}_end_callback") {
		my $callback = $self->{$slot};
		$callback->($origtext) if ref($callback) eq 'CODE';
	}

	return;
}



# parse($self, $data, name => $callback, ...)
#
# Registers the supplied callbacks on the object and then hands $data to the
# parent class's parse(). For each tag in @tag_names three names are
# recognised: "<tag>", "start_<tag>" and "end_<tag>". Only names actually
# present in the argument list are stored, so callbacks registered by an
# earlier call stay in place unless they are overridden. Any other names are
# ignored.
#
# Returns whatever the parent class's parse() returns.
sub parse
{
	my ($self, $data, %handler_for) = @_;

	foreach my $tag_name (@tag_names) {
		# Pairs of (argument name, slot on the object it is stored under).
		my @registrations = (
			[ $tag_name,         "${tag_name}_callback"       ],
			[ "start_$tag_name", "${tag_name}_start_callback" ],
			[ "end_$tag_name",   "${tag_name}_end_callback"   ],
		);

		foreach my $registration (@registrations) {
			my ($arg_name, $slot) = @$registration;
			next unless exists $handler_for{$arg_name};
			$self->{$slot} = $handler_for{$arg_name};
		}
	}

	return $self->SUPER::parse($data);
}




1;

__END__

=head1 NAME

HTML::TableExtractor - Run callbacks on the table-related tags of an HTML document.

=head1 SYNOPSIS

  use HTML::TableExtractor;
  $p = HTML::TableExtractor->new();
  $p->parse($html, 	table => sub { ... }, tr => sub { ... });

=head1 DESCRIPTION

Parses HTML looking for table-related elements (table, tr, td and th as of 
version 0.1).

Three callbacks can be registered for each element. These callbacks,
described below, are executed whenever an element of a particular type is
encountered.
  
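  o  start_${tagname}  Called whenever $tagname is opened.
  o  ${tagname}        Called immediately after start_${tagname}, and
                       immediately before end_${tagname}.
  o  end_${tagname}    Called whenever a closing $tagname is encountered.

Callbacks run for an opening tag are passed C<($attr, $origtext)>: the tag's
attributes and the original text of the tag. Callbacks run for a closing tag
are passed C<($origtext)> only.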


=head2 EXAMPLE

  use HTML::TableExtractor;
  $p = HTML::TableExtractor->new();
  $p->parse($html,
      start_table => sub {
        my ($attr, $origtext) = @_;
        print "Table border is $attr->{border}\n";
      },
      tr => sub { print "Row opened or closed.\n" },
      );

	
=head1 METHODS

=over 4

=item start($parser, $tag, $attr, $attrseq, $origtext);

Called whenever a particular start tag has been recognised. This module
recognises these tags: <table>, <tr>, <td> & <th>.

This method will be called by the parser and is not intended to be called from
an application. 

=item end($parser, $tag, $origtext); 

Called whenever a particular end tag is encountered.

This method will be called by the parser and is not intended to be called from
an application. 

=item $p->parse($html, tag_type => \&coderef, ...);

This is the only method you really need to call. Pass it the HTML to parse,
followed by a callback for each tag type you are interested in. These will be
executed as described above.


=back

=head2 EXPORTS

None.

=head2 CAVEATS, BUGS, and TODO

o  parse() should handle other data sources, such as streams, file handles,
etc.


=head2 SEE ALSO

HTML::Parser, HTML::TableContentParser

=head1 AUTHOR

Simon Drabble  E<lt>simon@thebigmachine.orgE<gt>

(C) 2002  Simon Drabble  

This software is released under the same terms as perl.

=cut