The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
Changes 021
MANIFEST 01
META.yml 22
Makefile.PL 11
README 22
lib/HTML/TableExtract.pm 142322
t/01_pod.t 11
t/02_pod_coverage.t 11
t/10_bulk.t 22
t/20_skew.t 01
t/30_tree.t 107
t/gnarly.html 01
t/testload.pm 933
13 files changed (This is a version diff) 170395
@@ -1,5 +1,26 @@
 Revision history for HTML::TableExtract
 
+2.10  Sat Jul 15 20:50:41 EDT 2006
+        - minor bug fixed in HTML repair routines (thanks to Dave Gray)
+
+2.09  Thu Jun  8 15:46:17 EDT 2006
+        - Tweaked rasterizer to handle some situations where the HTML is
+          broken but tables can still be inferred.
+        - Fixed TREE() definition for situations where import() is
+          not invoked. (thanks to DDICK on cpan.org)
+
+2.08  Wed May  3 17:17:33 EDT 2006
+        - Implemented new rasterizer for grid mapping. Thanks to Roland
+          Schar for a tortuous example of span issues.
+        - This also fixes a bug the old skew method had when it
+          encountered ridiculously large spans (out of memory). Thanks
+          to Andreas Gustafsson.
+        - Regular extraction and TREE mode are using the same
+          rasterizer now.
+        - Fixed HTML stripping for a header matching bug on single word
+          text in keep_html mode (thanks to Michael S. Muegel for
+          pointing the bug out)
+
 2.07  Sun Feb 19 13:40:44 EST 2006
         - Fixed subtable slicing bug
         - Fixed hrow() attachment bug
@@ -20,6 +20,7 @@ t/30_tree.t
 t/testload.pm
 t/basic.html
 t/basic2.html
+t/gnarly.html
 t/skew.html
 t/subtable.html
 t/ugly.html
@@ -1,11 +1,11 @@
 # http://module-build.sourceforge.net/META-spec.html
 #XXXXXXX This is a prototype!!!  It will change in the future!!! XXXXX#
 name:         HTML-TableExtract
-version:      2.07
+version:      2.10
 version_from: lib/HTML/TableExtract.pm
 installdirs:  site
 requires:
-    HTML::ElementTable:            1.13
+    HTML::ElementTable:            1.16
     HTML::Parser:                  0
 
 distribution_type: module
@@ -7,7 +7,7 @@ my %prereq_pm = (
 # The idea is to skip tests and dependencies on H::ET if it is not
 # installed at all. If it is presently installed, however, enforce the
 # version dependency.
-my $et_version = '1.13';
+my $et_version = '1.16';
 eval "use HTML::ElementTable";
 unless ($@) {
   $prereq_pm{'HTML::ElementTable'} = $et_version;
@@ -61,8 +61,8 @@ Thanks to the following people for their generous bug catching, fault
 analysis, and suggestions: Celeste Suliin Burris, Jeff Casey, David
 Finberg, Michael Fowler, Robert Goff, Klaus Gottschalk, Daniel Griscom,
 Jeremy Howard, Martin Joost, Jeff Lewwid, Nicholas R. Markham, Julian
-Mehnle, Patrick Naubert, Jani Ollikainen, Wilson Snyder, Volker Stuerzl,
-Steve Wong, and Matt Zip.
+Mehnle, Michael S. Muegel, Patrick Naubert, Jani Ollikainen, Wilson
+Snyder, Volker Stuerzl, Steve Wong, and Matt Zip.
 
 
 COPYRIGHT
@@ -12,7 +12,7 @@ use Carp;
 
 use vars qw($VERSION @ISA);
 
-$VERSION = '2.07';
+$VERSION = '2.10';
 
 use HTML::Parser;
 @ISA = qw(HTML::Parser);
@@ -23,6 +23,8 @@ use HTML::Entities;
 # default HTML::Parser. (use HTML::TableExtract qw(tree);) Also installs
 # a mode constant TREE().
 
+BEGIN { *TREE = sub { 0 } }
+
 sub import {
     my $class = shift;
     no warnings;
@@ -32,7 +34,7 @@ sub import {
     croak "Unknown mode '$mode'\n" unless $mode eq 'tree';
     eval "use HTML::TreeBuilder";
     croak "Problem loading HTML::TreeBuilder : $@\n" if $@;
-    eval "use HTML::ElementTable 1.13";
+    eval "use HTML::ElementTable 1.17";
     croak "problem loading HTML::ElementTable : $@\n" if $@;
     @ISA = qw(HTML::TreeBuilder);
     $class;
@@ -126,16 +128,12 @@ sub start {
       ++$skiptag;
     }
     elsif ($_[0] eq 'td' || $_[0] eq 'th') {
-      $ts->_enter_cell;
-      # Inspect rowspan/colspan attributes, record as necessary for
-      # future column count transforms.
-      if ($self->{gridmap}) {
-        my %attrs = ref $_[1] ? %{$_[1]} : {};
-        my $rspan = $attrs{rowspan} || 1;
-        my $cspan = $attrs{colspan} || 1;
-        $ts->_skew($rspan, $cspan);
-        $ts->_set_grid($rspan, $cspan, @res);
-      }
+      $ts->_enter_cell(@_);
+      my %attrs = ref $_[1] ? %{$_[1]} : {};
+      my $rspan = $attrs{rowspan} || 1;
+      my $cspan = $attrs{colspan} || 1;
+      $ts->_rasterizer->($ts->row_count, $rspan, $cspan);
+      $ts->_anchor_item(@res);
       ++$skiptag;
     }
     if ($self->{keep_html} && !$skiptag) {
@@ -238,28 +236,19 @@ sub tables {
   @{$self->{_ts_sequential}};
 }
 
-# we are an HTML::TreeBuilder, which is an HTML::Element structure after
-# parsing...but we provide this for consistency with the table object
-# method for accessing the tree structures.
+# in tree mode, we already are an HTML::TreeBuilder, which is an
+# HTML::Element structure after parsing...but we provide this for
+# consistency with the table object method for accessing the tree
+# structures.
 
 sub tree { shift }
 
 sub tables_report {
   # Print out a summary of extracted tables, including depth/count
-  my($self, $include_content, $col_sep) = @_;
-  $col_sep ||= ':';
+  my $self = shift;
   my $str;
   foreach my $ts ($self->tables) {
-    $str .= "TABLE(" . $ts->depth . ", " . $ts->count . ')';
-    if ($include_content) {
-      $str .= ":\n";
-      foreach my $row ($ts->rows) {
-        $str .= join($col_sep, @$row) . "\n";
-      }
-    }
-    else {
-      $str .= "\n";
-    }
+    $str .= $ts->report(@_);
   }
   $str;
 }
@@ -334,7 +323,8 @@ sub _enter_table {
   # if we are under an umbrella. Notice that with table states, "depth"
   # and "count" are absolute coordinates recording where this table was
   # created, whereas "tdepth" and "tcount" are the target constraints.
-  # Headers "absolute" meaning, therefore are passed by the same name.
+  # Headers have "absolute" meaning and are therefore passed by the
+  # same name.
   if (!$umbrella) {
     $tsparms{tdepth}   = $self->{depth};
     $tsparms{tcount}   = $self->{count};
@@ -357,8 +347,17 @@ sub _exit_table {
   my $ts = $self->current_table;
 
   # Last ditch fix for HTML mangle
-  $ts->_exit_cell if $ts->{in_cell};
-  $ts->_exit_row if $ts->{in_row};
+  if ($ts->{in_cell}) {
+    $self->_emsg("Mangled HTML in table ($self->{depth},$self->{count}), forcing exit of cell ($ts->{rc},$ts->{cc}) due to table exit\n") if $self->{debug};
+    $ts->_exit_cell;
+  }
+  if ($ts->{in_row}) {
+    $self->_emsg("Mangled HTML in table ($self->{depth},$self->{count}), forcing exit of row $ts->{rc} due to table exit\n") if $self->{debug};
+    $ts->_exit_row;
+  }
+
+  # transform from tree to grid using our rasterized template
+  $ts->_grid_map();
 
   $self->_capture_table($ts) if $ts->_check_triggers;
 
@@ -386,9 +385,7 @@ sub _capture_table {
     $msg .= "\n";
     $self->_emsg($msg);
   }
-  if (TREE()) {
   $ts->tree(HTML::ElementTable->new_from_tree($ts->tree)) if TREE();
-  }
   if ($self->{subtables}) {
     foreach my $child (@{$ts->{children}}) {
       next if $child->{captured};
@@ -451,7 +448,6 @@ sub _emsg {
                  rc          => -1,
                  cc          => -1,
                  grid        => [],
-                 gridalias   => [],
                  translation => [],
                  hrow        => [],
                  order       => [],
@@ -459,6 +455,8 @@ sub _emsg {
                  captured    => 0,
                  debug       => 0,
                 };
+
+    $self->{_rastamon} = HTML::TableExtract::Rasterize->make_rasterizer();
     bless $self, $class;
 
     my %parms = @_;
@@ -481,10 +479,12 @@ sub _emsg {
     $self;
   }
 
-  sub _set_grid {
-    my($self, $rspan, $cspan, @res) = @_;
-    my $row = $self->{rc};
-    my $col = $self->_skew;
+  sub _anchor_item {
+    # anchor the reference to a cell in our grid -- in TREE mode this is
+    # a reference to a data element, otherwise it's a reference to an
+    # empty scalar in which we will collect our text.
+    my($self, @res) = @_;
+    my $row  = $self->{grid}[-1];
     my $item;
     if (@res && ref $res[0]) {
       $item = $res[0];
@@ -493,19 +493,75 @@ sub _emsg {
       my $scalar_ref;
       $item = \$scalar_ref;
     }
-    $self->{grid}[$row][$col]        = $item;
-    $self->{gridalias}[$row][$col]   = $item;
-    $self->{translation}[$row][$col] = "$row,$col";
-    foreach my $rc (0 .. $rspan - 1) {
-      foreach my $cc (0 .. $cspan - 1) {
-        my($r, $c) = ($row + $rc, $col + $cc);
-        next if $r == $row && $c == $col;
-        my $blank;
-        $self->{grid}[$r][$c] = \$blank;
-        $self->{gridalias}[$r][$c] = $item;
-        $self->{translation}[$r][$c] = "$row,$col";
+    push(@$row, $item);
+  }
+
+  sub _gridalias {
+    my $self = shift;
+    $self->{gridalias} ||= $self->_make_gridalias;
+  }
+
+  sub _grid_map {
+    # using our rasterized template, flesh out our captured items which
+    # are still in 'tree' format
+    my $self = shift;
+    my $template = $self->_rasterizer->();
+    my $grid = $self->{grid};
+    # drop empty rows
+    if ($self->{debug}) {
+      foreach (0 .. $#$grid) {
+        next if @{$grid->[$_]};
+        $self->_emsg("Dropping empty row $_\n");
+      }
+    }
+    @$grid = grep(@$_, @$grid);
+    foreach my $r (0 .. $#$template) {
+      my $row  = $grid->[$r];
+      my $trow = $template->[$r];
+      $self->_emsg("Flesh row $r ($#$row) to $#$trow\n") if $self->{debug} > 1;
+      foreach my $c (0 .. $#$trow) {
+        print STDERR $trow->[$c] ? '1' : '0' if $self->{debug} > 1;
+        if ($trow->[$c]) {
+          if (! defined $row->[$c]) {
+            $row->[$c] = \undef;
+          }
+          next;
+        }
+        else {
+          my $scalar;
+          splice(@$row, $c, 0, \$scalar);
+        }
+      }
+      print STDERR "\n" if $self->{debug} > 1;
+      croak "row $r splice mismatch: $#$row vs $#$trow\n"
+        unless $#$row == $#$trow;
+    }
+    $grid;
+  }
+
+  sub _make_gridalias {
+    # our aliased grid will have references in masked cells to the same
+    # cell that is covering it via spanning.
+    my $self = shift;
+    my $grid = $self->{grid};
+    my $template = $self->_rasterizer->();
+    my(@gridalias, @translation);
+    $gridalias[$_] = [@{$grid->[$_]}] foreach 0 .. $#$grid;
+    foreach my $r (0 .. $#gridalias) {
+      my $row = $gridalias[$r];
+      foreach my $c (0 .. $#$row) {
+        my $tcell = $template->[$r][$c] || next;
+        my($rspan, $cspan) = @$tcell;
+        foreach my $rs (0 .. $rspan-1) {
+          foreach my $cs (0 .. $cspan-1) {
+            $gridalias[$r + $rs][$c + $cs] = $grid->[$r][$c];
+            $translation[$r + $rs][$c + $cs] = "$r,$c";
+          }
+        }
       }
     }
+    $self->{translation} = \@translation;
+    $self->{gridalias}   = \@gridalias;
   }
 
   ### Constraint tests
@@ -565,9 +621,8 @@ sub _emsg {
           if ($ref_type eq 'SCALAR') {
             my $item = $$ref;
             if ($self->{keep_html} && $self->{strip_html_on_match}) {
-              my $strip = HTML::TableExtract::StripHTML->new;
-              $strip->parse($item);
-              $target = $strip->tidbit;
+              my $stripper = HTML::TableExtract::StripHTML->new;
+              $target = $stripper->strip($item);
             }
             else {
               $target = $item;
@@ -646,24 +701,22 @@ sub _emsg {
 
   sub _enter_row {
     my $self = shift;
-    $self->_exit_cell if $self->{in_cell};
-    $self->_exit_row if $self->{in_row};
+    if ($self->{in_row}) {
+      $self->_emsg("Mangled HTML in table ($self->{depth},$self->{count}), forcing exit of row $self->{rc} due to new row\n") if $self->{debug};
+      $self->_exit_row;
+    }
     ++$self->{rc};
     ++$self->{in_row};
-
-    # Reset next_col for gridmapping
-    $self->{next_col} = 0;
-    while ($self->{taken}{"$self->{rc},$self->{next_col}"}) {
-      ++$self->{next_col};
-    }
-
     push(@{$self->{grid}}, [])
   }
 
   sub _exit_row {
     my $self = shift;
     if ($self->{in_row}) {
-      $self->_exit_cell if $self->{in_cell};
+      if ($self->{in_cell}) {
+        $self->_emsg("Mangled HTML in table ($self->{depth},$self->{count}), forcing exit of cell ($self->{rc}, $self->{cc}) due to new row\n") if $self->{debug};
+        $self->_exit_cell;
+      }
       $self->{in_row} = 0;
       $self->{cc} = -1;
     }
@@ -675,7 +728,10 @@ sub _emsg {
 
   sub _enter_cell {
     my $self = shift;
-    $self->_exit_cell if $self->{in_cell};
+    if ($self->{in_cell}) {
+      $self->_emsg("Mangled HTML in table ($self->{depth},$self->{count}), forcing exit of cell ($self->{rc},$self->{cc}) due to new cell\n") if $self->{debug};
+      $self->_exit_cell;
+    }
     if (!$self->{in_row}) {
       # Go ahead and try to recover from mangled HTML, because we care.
       $self->_emsg("Mangled HTML in table ($self->{depth},$self->{count}), inferring <TR> as row $self->{rc}\n")
@@ -684,6 +740,9 @@ sub _emsg {
     }
     ++$self->{cc};
     ++$self->{in_cell};
+    my %attrs = ref $_[1] ? %{$_[1]} : {};
+    my $rspan = $attrs{rowspan} || 1;
+    my $cspan = $attrs{colspan} || 1;
   }
 
   sub _exit_cell {
@@ -760,15 +819,23 @@ sub _emsg {
     @{$self->{lineage}};
   }
 
-  sub rows {
+  sub rows { shift->_rows(0) }
+
+  sub space_rows {
     my $self = shift;
+    $self->_rows(1);
+  }
+
+  sub _rows {
+    my $self  = shift;
+    my $alias = shift;
     my @ri = $self->row_indices;
     my @rows;
-    my $grid = $self->{grid};
+    my $grid = $alias ? $self->_gridalias : $self->{grid};
     foreach ($self->row_indices) {
       push(@rows, scalar $self->_slice_and_normalize_row($grid->[$_]));
     }
-    @rows;
+    wantarray ? @rows : \@rows;
   }
 
   sub columns {
@@ -856,9 +923,10 @@ sub _emsg {
   sub space {
     my $self = shift;
     my($r, $c) = @_;
-    $r <= $#{$self->{gridalias}}
-      or croak "row $r out of range ($#{$self->{gridalias}})\n";
-    my $row = $self->{gridalias}[$r];
+    my $gridalias = $self->_gridalias;
+    $r <= $#$gridalias
+      or croak "row $r out of range ($#$gridalias)\n";
+    my $row = $gridalias->[$r];
     $c <= $#$row or croak "Column $c out of range ($#$row)\n";
     $self->_cell_to_content($row->[$c]);
   }
@@ -928,70 +996,12 @@ sub _emsg {
   sub _add_text {
     my($self, $txt) = @_;
     my $r = $self->{rc};
+    my $c = $self->{cc};
     my $row = $self->{grid}[$r];
-    my $c = $self->_skew;
     ${$row->[$c]} .= $txt;
     $txt;
   }
 
-  sub _skew {
-    # Skew registers the effects of rowspan/colspan issues when gridmap
-    # is enabled.
-
-    my($self, $rspan, $cspan) = @_;
-    my($r,$c) = ($self->{rc},$self->{cc});
-
-    if ($self->{debug} > 6) {
-      $self->_emsg("($self->{rc},$self->{cc}) Inspecting skew for ($r,$c)");
-      $self->_emsg(defined $rspan ? " (set with $rspan,$cspan)\n" : "\n");
-    }
-
-    my $sc = $c;
-    if (! defined $self->{skew_cache}{"$r,$c"}) {
-      $sc = $self->{next_col} if defined $self->{next_col};
-      $self->{skew_cache}{"$r,$c"} = $sc;
-      my $next_col = $sc + 1;
-      while ($self->{taken}{"$r,$next_col"}) {
-        ++$next_col;
-      }
-      $self->{next_col} = $next_col;
-    }
-    else {
-      $sc = $self->{skew_cache}{"$r,$c"};
-    }
-
-    # If we have span arguments, set skews
-    if (defined $rspan) {
-      # Default span is always 1, even if not explicitly stated.
-      $rspan ||= 1;
-      $cspan ||= 1;
-      --$rspan; --$cspan;
-      # 1,1 is a degenerate case, there's nothing to do.
-      if ($rspan || $cspan) {
-        foreach my $rs (0 .. $rspan) {
-          my $cr = $r + $rs;
-          # If we in the same row as the skewer, the "span" is one less
-          # because the skewer cell occupies the same row.
-          my $start_col = $rs ? $sc : $sc + 1;
-          my $fin_col   = $sc + $cspan;
-          foreach ($start_col .. $fin_col) {
-            $self->{taken}{"$cr,$_"} ||= "$r,$sc";
-          }
-          if (!$rs) {
-            my $next_col = $fin_col + 1;
-            while ($self->{taken}{"$cr,$next_col"}) {
-              ++$next_col;
-            }
-            $self->{next_col} = $next_col;
-          }
-        }
-      }
-    }
-
-    # Grid column number
-    $sc;
-  }
-
   sub _reset_hits {
     my $self = shift;
     return unless $self->{headers};
@@ -1003,6 +1013,31 @@ sub _emsg {
     1;
   }
 
+  sub _rasterizer { shift->{_rastamon} }
+
+  sub report {
+    # Build and return a summary string for this table, including depth/count
+    my($self, $include_content, $col_sep) = @_;
+    $col_sep ||= ':';
+    my $str;
+    $str .= "TABLE(" . $self->depth . ", " . $self->count . ')';
+    if ($include_content) {
+      $str .= ":\n";
+      foreach my $row ($self->rows) {
+        $str .= join($col_sep, @$row) . "\n";
+      }
+    }
+    else {
+      $str .= "\n";
+    }
+    $str;
+  }
+
+  sub dump {
+    my $self = shift;
+    $self->_emsg($self->report(@_));
+  }
+
   sub _emsg {
     my $self = shift;
     my $fh = $self->{error_handle};
@@ -1015,6 +1050,149 @@ sub _emsg {
 
 {
 
+  package HTML::TableExtract::Rasterize;
+
+  # Provide a closure that will rasterize (turn into a grid) a table
+  # from a tree structure based on repeated data element calls with
+  # rowspan and colspan information. Not as straightforward as it
+  # seems...see test cases for an example bugaboo.
+
+  my $DEBUG = 0;
+
+  sub make_rasterizer {
+    my $pkg = shift;
+    my(@grid, @row_spinner, @col_spinner);
+    my $empty_row_offset = 0;
+    sub {
+      return \@grid unless @_;
+      my($row_num, $rspan, $cspan) = @_;
+      $rspan = 1 unless $rspan > 1;
+      $cspan = 1 unless $cspan > 1;
+      my($rspin_propogate, $row_added);
+      my $trigger = $#grid + $empty_row_offset;
+      if ($row_num > $trigger) {
+        # adjust for having been handed a row that skips a prior row,
+        # otherwise the next cell will land in a wrong row. Hopefully
+        # this doesn't happen too often but I've seen it in the wild!
+        if ($row_num - $trigger > 1) {
+          $empty_row_offset += $row_num - $trigger - 1;
+        }
+        # add new row
+        $row_added = 1;
+        my @new_row;
+        # first add new row spinner
+        if ($row_spinner[-1] && $col_spinner[-1]) {
+          push(@row_spinner, $row_spinner[-1]);
+          $rspin_propogate = 1;
+        }
+        else {
+          push(@row_spinner, $cspan - 1);
+        }
+        # spin columns
+        foreach (@col_spinner) {
+          if ($_) {
+            push(@new_row, 0);
+            --$_;
+          }
+          else {
+            push(@new_row, undef);
+          }
+        }
+        @new_row = (undef) unless @new_row;
+        push(@grid, \@new_row);
+      }
+      my $current_row = $grid[-1];
+      # locate next available cell in row
+      my $col;
+      foreach my $ci (0 .. $#$current_row) {
+        if (! defined $current_row->[$ci]) {
+          $col = $ci;
+          last;
+        }
+      }
+      if (! defined $col) {
+        ADDCOL: while (! defined $col) {
+          # if no cells were available, add a column
+          foreach my $ri (0 .. $#grid) {
+            my $row = $grid[$ri];
+            my $cspan_count = $row_spinner[$ri];
+            if (!$cspan_count) {
+              push(@$row, undef);
+            }
+            else {
+              push(@$row, 0);
+              --$row_spinner[$ri];
+            }
+          }
+          push(@col_spinner, $col_spinner[-1]);
+          foreach my $ci (0 .. $#$current_row) {
+            if (! defined $current_row->[$ci]) {
+              $col = $ci;
+              last ADDCOL;
+            }
+          }
+        }
+        $col_spinner[-1] = $rspan - 1 if $col == $#$current_row;
+        $row_spinner[$#grid] = $cspan - 1;
+      }
+
+      # we now have correct coordinates for this element
+      $current_row->[$col] = [$rspan, $cspan];
+      $col_spinner[$col] = $rspan - 1;
+
+      # if this is an embedded placement (not a trailing element), use up
+      # the cspan
+      if ($col < $#$current_row) {
+        my $offset = 1;
+        my $row_span = $col_spinner[$col];
+        if ($col + $row_spinner[-1] < $#$current_row &&
+            $row_added && !$rspin_propogate) {
+          # cell is spun out -- clear spinner unless it inherited cspan
+          # from a cell above
+          $row_spinner[-1] = 0;
+        }
+        while ($offset < $cspan) {
+          my $cursor = $col + $offset;
+          $current_row->[$cursor] = 0;
+          $col_spinner[$cursor] = $row_span;
+          ++$offset;
+          if ($col + $offset > $#$current_row) {
+            $row_spinner[-1] = $cspan - $offset;
+            last;
+          }
+        }
+      }
+
+      if ($DEBUG) {
+        foreach my $r (0 .. $#grid) {
+          my $row = $grid[$r];
+          foreach my $c (0 .. $#$row) {
+            if (defined $row->[$c]) {
+              print STDERR $row->[$c] ? 1 : 0;
+            }
+            else {
+              print STDERR '?';
+            }
+          }
+          print STDERR " $row_spinner[$r]\n";
+        }
+        print STDERR "\n";
+        foreach (@col_spinner) {
+          print STDERR defined $_ ? $_ : '?';
+        }
+        print STDERR "\n\n-----\n\n";
+      }
+
+      return \@grid;
+    }
+  }
+
+}
+
+##########
+
+{
+
   package HTML::TableExtract::StripHTML;
 
   use vars qw(@ISA);
@@ -1022,8 +1200,6 @@ sub _emsg {
   use HTML::Parser;
   @ISA = qw(HTML::Parser);
 
-  my %inside;
-
   sub tag {
    my($self, $tag, $num) = @_;
    $self->{_htes_inside}{$tag} += $num;
@@ -1031,7 +1207,7 @@ sub _emsg {
 
   sub text {
     my $self = shift;
-    return if $inside{script} || $inside{style};
+    return if $self->{_htes_inside}{script} || $self->{_htes_inside}{style};
     $self->{_htes_tidbit} .= $_[0];
   }
 
@@ -1048,7 +1224,12 @@ sub _emsg {
     bless $self, $class;
   }
 
-  sub tidbit { shift->{_htes_tidbit} }
+  sub strip {
+    my $self = shift;
+    $self->parse(shift);
+    $self->eof;
+    $self->{_htes_tidbit};
+  }
 
 }
 
@@ -1357,14 +1538,12 @@ extracting into an elment tree structure.
 
 =item strip_html_on_match
 
-When C<keep_html> is enabled, HTML is retained by default during
-attempts at matching header strings. With C<strip_html_on_match>
-enabled, html tags are first stripped from header strings before any
-comparisons are made. (so if C<strip_html_on_match> is not enabled and
-C<keep_html> is, you would have to include potential HTML tags in the
-regexp for header matches). Stripped header tags are replaced with an
-empty string, e.g. 'hot dE<lt>emE<gt>ogE<lt>/emE<gt>' would become 'hot
-dog' before attempting a match.
+When C<keep_html> is enabled, HTML is stripped by default during
+attempts at matching header strings; C<strip_html_on_match> controls
+this. (If it is disabled while C<keep_html> is enabled, you would have
+to include potential HTML tags in the regexp for header matches.)
+Stripped header tags are replaced with an empty string, e.g. 'hot
+dE<lt>emE<gt>ogE<lt>/emE<gt>' becomes 'hot dog' before attempting a match.
 
 =item error_handle
 
@@ -1479,6 +1658,7 @@ Return all rows within a matched table. Each row returned is a reference
 to an array containing the text, HTML, or reference to the HTML::Element
 object of each cell depending the mode of extraction. Tables with
 rowspan or colspan attributes will have some cells containing undef.
+Returns a list or a reference to an array depending on context.
 
 =item columns()
 
@@ -1573,7 +1753,7 @@ HTML::ElementTable objects have their own row(), col(), and cell()
 methods (among others). These are not to be confused with the row() and
 column() methods provided by the HTML::TableExtract::Table objects.
 
-For example, the row() method from HTML::ElmentTable will provide a
+For example, the row() method from HTML::ElementTable will provide a
 reference to a 'glob' of all the elements in that row. Actions (such as
 setting attributes) performed on that row reference will affect all
 elements within that row. On the other hand, the row() method from the
@@ -1594,7 +1774,7 @@ it might be more efficient to access them via the methods provided by
 the HTML::ElementTable object instead. See L<HTML::ElementTable> for
 more information on how to manipulate those objects.
 
-Another option to the cell() method in HTML::TableExtract::Table is the
+An alternative to the cell() method in HTML::TableExtract::Table is the
 space() method. It is largely similar to cell(), except when given
 coordinates of a cell that was covered due to rowspan or colspan
 effects, it will return the contents of the cell that was covering that
@@ -1616,7 +1796,7 @@ Matthew P. Sisk, E<lt>F<sisk@mojotoad.com>E<gt>
 
 =head1 COPYRIGHT
 
-Copyright (c) 2000-2005 Matthew P. Sisk.
+Copyright (c) 2000-2006 Matthew P. Sisk.
 All rights reserved. All wrongs revenged. This program is free
 software; you can redistribute it and/or modify it under the same terms
 as Perl itself.
@@ -1,4 +1,4 @@
 use Test::More;
 eval "use Test::Pod 1.00";
-plan skip_all => "Test::Pod 1.00 required for testing POD" if $@;
+plan skip_all => "Test::Pod 1.00 or greater required for testing POD" if $@;
 all_pod_files_ok();
@@ -1,4 +1,4 @@
 use Test::More;
 eval "use Test::Pod::Coverage 1.00";
 plan skip_all => "Test::Pod::Coverage 1.00 required for testing POD coverage" if $@;
-all_pod_coverage_ok();
+all_pod_coverage_ok({also_private => [qw/TREE/]});
@@ -2,7 +2,7 @@
 
 use strict;
 use lib './lib';
-use Test::More tests => 51;
+use Test::More tests => 52;
 
 use FindBin;
 use lib $FindBin::RealBin;
@@ -10,7 +10,7 @@ use testload;
 
 my $file = "$Dat_Dir/ugly.html";
 
-use HTML::TableExtract;
+BEGIN { require_ok('HTML::TableExtract') }
 
 # by bulk, lineage integrity
 my $label = 'by bulk with lineage check';
@@ -49,5 +49,6 @@ sub alias_test {
   cmp_ok($str, 'eq',  $item2,  "$label (via rows)");
   cmp_ok($str, 'eq',  $cell,   "$label (via cell)");
   cmp_ok($str, 'eq',  $space,  "$label (via space)");
+  no warnings;
   cmp_ok(undef, 'eq', $cellno, "$label (undef via cell)");
 }
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 
 my $test_count;
-BEGIN { $test_count = 189 }
+BEGIN { $test_count = 126 }
 
 use strict;
 use lib './lib';
@@ -11,7 +11,7 @@ use FindBin;
 use lib $FindBin::RealBin;
 use testload;
 
-my $et_version = '1.13';
+my $et_version = '1.17';
 
 my($tb_present, $et_present);
 eval  "use HTML::TreeBuilder";
@@ -25,17 +25,14 @@ SKIP: {
   skip "HTML::ElementTable $et_version not installed",
        $test_count unless $et_present;
   use_ok("HTML::TableExtract", qw(tree));
-  my $file = "$Dat_Dir/basic.html";
+  my $file = "$Dat_Dir/gnarly.html";
   my $label = 'element table';
-  my $te = HTML::TableExtract->new(
-    depth     => 0,
-    count     => 2,
-  );
+  my $te = HTML::TableExtract->new();
   isa_ok($te, 'HTML::TreeBuilder', "$label - HTML::TableExtract");
   ok($te->parse_file($file), "$label (parse_file)");
   my @tablestates = $te->tables;
   cmp_ok(@tablestates, '==', 1, "$label (extract count)");
-  good_data($_, "$label (data)") foreach @tablestates;
+  good_gnarly_data($_, "$label (data)") foreach @tablestates;
   my $tree = $te->tree;
   ok($tree, 'treetop');
   isa_ok($tree, 'HTML::Element');
@@ -57,14 +54,14 @@ SKIP: {
   # TREE() gets called during header extractions, make sure it does
   $label .= ' (header)';
   $te = HTML::TableExtract->new(
-    headers => [qw(Eight Six Four Two Zero)],
+    headers => [qw{(0,1) [2,4]}],
   );
   ok($te->parse_file($file), "$label (parse_file)");
   $tree = $te->tree;
   ok($tree, 'treetop');
   isa_ok($tree, 'HTML::Element');
   my $table = $te->first_table_found;
-  good_data($table, "$label (data)");
+  good_gnarly_data($table, "$label (data)");
   $tree = $table->tree;
   ok($tree, 'tabletop');
   isa_ok($tree, 'HTML::ElementTable');
@@ -0,0 +1 @@
+<html><head><title>gnarly table</title></head><body><table border=1><tr><td colspan=4 rowspan=1>(0,0) [1,4]</td><td colspan=4 rowspan=2>(0,1) [2,4]</td></tr><tr><td colspan=1 rowspan=2>(1,0) [2,1]</td><td colspan=1 rowspan=1>(1,1) [1,1]</td><td colspan=2 rowspan=1>(1,2) [1,2]</td></tr><tr><td colspan=4 rowspan=2>(2,0) [2,4]</td><td colspan=2 rowspan=2>(2,1) [2,2]</td><td colspan=1 rowspan=1>(2,2) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(3,0) [1,1]</td><td colspan=1 rowspan=1>(3,1) [1,1]</td></tr><tr><td colspan=2 rowspan=3>(4,0) [3,2]</td><td colspan=1 rowspan=1>(4,1) [1,1]</td><td colspan=1 rowspan=3>(4,2) [3,1]</td><td colspan=4 rowspan=4>(4,3) [4,4]</td></tr><tr><td colspan=1 rowspan=1>(5,0) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(6,0) [1,1]</td></tr><tr><td colspan=4 rowspan=1>(7,0) [1,4]</td></tr></table></body></html>
@@ -5,13 +5,16 @@ use Test::More;
 use File::Spec;
 
 use vars qw( @ISA @EXPORT $Dat_Dir
-             @LINEAGE_DATA @HEADERS @SKEW_DATA @TRANSLATION_DATA
+             @LINEAGE_DATA @HEADERS @SKEW_DATA
+             @GNARLY_DATA @TRANSLATION_DATA
            );
 
 require Exporter;
 @ISA = qw(Exporter);
-@EXPORT = qw( $Dat_Dir @LINEAGE_DATA @HEADERS @SKEW_DATA @TRANSLATION_DATA
-              good_data good_slice_data good_skew_data good_sticky_data
+@EXPORT = qw( $Dat_Dir @LINEAGE_DATA @HEADERS @SKEW_DATA
+              @TRANSLATION_DATA @GNARLY_DATA
+              good_data good_slice_data good_skew_data
+              good_gnarly_data good_sticky_data
             );
 
 my $base_dir;
@@ -79,6 +82,17 @@ $Dat_Dir = $base_dir;
   [ '6,0', '6,0', '6,2', '6,3' ]
 );
 
+@GNARLY_DATA = (
+  [ '(0,0) [1,4]',            '',            '',            '', '(0,1) [2,4]',            '',            '',            '' ],
+  [ '(1,0) [2,1]', '(1,1) [1,1]', '(1,2) [1,2]',            '',            '',            '',            '',            '' ],
+  [            '', '(2,0) [2,4]',            '',            '',            '', '(2,1) [2,2]',            '', '(2,2) [1,1]' ],
+  [ '(3,0) [1,1]',            '',            '',            '',            '',            '',            '', '(3,1) [1,1]' ],
+  [ '(4,0) [3,2]',            '', '(4,1) [1,1]', '(4,2) [3,1]', '(4,3) [4,4]',            '',            '',            '' ],
+  [            '',            '', '(5,0) [1,1]',            '',            '',            '',            '',            '' ],
+  [            '',            '', '(6,0) [1,1]',            '',            '',            '',            '',            '' ],
+  [ '(7,0) [1,4]',            '',            '',            '',            '',            '',            '',            '' ]
+);
+
 sub good_data {
   my($ts, $label, @slice) = @_;
   ref $ts or die "Oops: Table state ref required\n";
@@ -145,8 +159,18 @@ sub good_slice_data {
   }
 }
 
-sub good_skew_data {
-  my($ts, $label, $reverse) = @_;
+sub good_skew_data   {
+  push(@_, 0) if @_ == 2;
+  _good_span_data(@_, \@SKEW_DATA);
+}
+
+sub good_gnarly_data {
+  push(@_, 0) if @_ == 2;
+  _good_span_data(@_, \@GNARLY_DATA);
+}
+
+sub _good_span_data {
+  my($ts, $label, $reverse, $REF_DATA) = @_;
   ref $ts or die "Oops: Table state ref required\n";
   my $t = $ts->{grid};
   foreach my $r (1 .. $#$t) {
@@ -154,9 +178,9 @@ sub good_skew_data {
     my @cols = 0 .. $#$row;
     @cols = reverse @cols if $reverse;
     foreach my $c (@cols) {
-      my $txt = ref $row->[$c] eq 'SCALAR' ?
-        ${$row->[$c]} : $row->[$c]->as_text;
-      cmp_ok($txt, 'eq', $SKEW_DATA[$r][$c], $label);
+      my $txt = ref $row->[$c] eq 'SCALAR' ?  ${$row->[$c]} : $row->[$c]->as_text;
+      $txt = '' unless defined $txt;
+      cmp_ok($txt, 'eq', $REF_DATA->[$r][$c], $label);
     }
   }
   1;
@@ -166,7 +190,7 @@ sub good_sticky_data {
   # testing grid aliasing
   my($ts, $label, $reverse) = @_;
   ref $ts or die "Oops: Table state ref required\n";
-  my $t = $ts->{gridalias};
+  my $t = $ts->_gridalias;
   foreach my $r (0 .. $#$t) {
     my $row = $t->[$r];
     my @cols = 0 .. $#$row;