The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
Changes 016
MANIFEST 110
META.yml 022
Makefile.PL 1432
Strip.pm 60115
Strip.xs 333
strip_html.c 1954
strip_html.h 04
t/auto-reset.t 020
t/basic.t 067
t/comment.t 038
t/edge-case.t 020
t/filter.t 018
t/mathematical-comparisons.t 017
t/offbyone.t 08
t/striptags.t 031
t/whitespace-single-char.t 010
test.pl 910
18 files changed (This is a version diff) 188515
@@ -1,5 +1,21 @@
 Revision history for Perl extension HTML::Strip.
 
+1.07  Tue Sep 23 14:44:08 UTC 2014
+    - fix to bug RT#19036 - tags not replaced with spaces when only a single
+      character is between the tags
+    - fix to bug RT#35345 - mathematical comparisons within <script> tags
+      misunderstood
+    (patches contributed by Adriano Ferreira)
+    - Exporter was never needed
+    - Allow other filtering operations than just decoding of HTML entities
+    - Modernised test suite
+    - Adds 'auto_reset' attribute, which allows automagic use of $hs->eof
+    - fixes quotes in html comments (RT#32355)
+    (patch contributed by rurban@cpan.org)
+    - MSVC doesn't define strcasecmp, use stricmp instead
+    (patch contributed by Damyan Ivanov)
+    - fixes POD errors
+
 1.06  Fri Feb 10 11:18:35 2006
 	- documented 'set_decode_entities' method
 
@@ -1,10 +1,19 @@
 Changes
 Makefile.PL
 MANIFEST
+META.yml
 README
 Strip.pm
 Strip.xs
 strip_html.h
 strip_html.c
 typemap
-test.pl
+t/basic.t
+t/auto-reset.t
+t/comment.t
+t/edge-case.t
+t/filter.t
+t/mathematical-comparisons.t
+t/offbyone.t
+t/striptags.t
+t/whitespace-single-char.t
@@ -0,0 +1,22 @@
+---
+abstract: 'Perl extension for stripping HTML markup from text.'
+author:
+  - 'Alex Bowley <kilinrax@cpan.org>'
+build_requires:
+  ExtUtils::MakeMaker: 0
+configure_requires:
+  ExtUtils::MakeMaker: 0
+distribution_type: module
+dynamic_config: 0
+generated_by: 'ExtUtils::MakeMaker version 6.57_05'
+license: unknown
+meta-spec:
+  url: http://module-build.sourceforge.net/META-spec-v1.4.html
+  version: 1.4
+name: HTML-Strip
+no_index:
+  directory:
+    - t
+    - inc
+requires: {}
+version: 1.07
@@ -1,18 +1,36 @@
+
+use 5.006;
 use ExtUtils::MakeMaker;
+my $EUMM_VERSION = eval $ExtUtils::MakeMaker::VERSION;
 
-# See lib/ExtUtils/MakeMaker.pm for details of how to influence
-# the contents of the Makefile that is written.
 WriteMakefile(
-    'NAME'		=> 'HTML::Strip',
-    'VERSION_FROM'	=> 'Strip.pm', # finds $VERSION
-    'PREREQ_PM'		=> {}, # e.g., Module::Name => 1.1
-    ($] >= 5.005 ?    ## Add these new keywords supported since 5.005
-      (ABSTRACT_FROM => 'Strip.pm', # retrieve abstract from module
-       AUTHOR     => 'Alex Bowley <kilinrax@cpan.org>') : ()),
-    'LIBS'		=> [''], # e.g., '-lm'
-    'DEFINE'		=> '', # e.g., '-DHAVE_SOMETHING'
-	# Insert -I. if you add *.h files later:
-    'INC'		=> '', # e.g., '-I/usr/include/other'
-	# Un-comment this if you add C files to link with later:
-    'OBJECT'		=> '$(O_FILES)', # link all the C files too
+    'NAME'              => 'HTML::Strip',
+    'VERSION_FROM'      => 'Strip.pm',
+    'PREREQ_PM'         => {
+        # core modules
+        'warnings'      => 0,
+        'strict'        => 0,
+        'Carp'          => 0,
+        'Exporter'      => 0,
+        'DynaLoader'    => 0,
+
+        # build requires
+        'Test::More'    => 0,
+    },
+  ( $] >= 5.005 ? (
+    ABSTRACT_FROM       => 'Strip.pm',
+    AUTHOR              => 'Alex Bowley <kilinrax@cpan.org>'
+  ) : () ),
+  ( $EUMM_VERSION >= 6.46 ? (
+    LICENSE             => 'perl',
+    META_MERGE => {
+        recommended => {
+            'HTML::Entities' => 0,
+        },
+    },
+  ) : () ),
+    'LIBS'              => [''], # e.g., '-lm'
+    'DEFINE'            => '', # e.g., '-DHAVE_SOMETHING'
+    'INC'               => '', # e.g., '-I/usr/include/other'
+    'OBJECT'            => '$(O_FILES)', # link all the C files too
 );
@@ -1,85 +1,108 @@
 package HTML::Strip;
 
+require DynaLoader;
+our @ISA = qw(DynaLoader);
+our $VERSION = '1.07';
+bootstrap HTML::Strip $VERSION;
+
 use 5.006;
 use warnings;
 use strict;
 
-use Carp qw( carp croak );
-
-require Exporter;
-require DynaLoader;
-
-our @ISA = qw(Exporter DynaLoader);
-
-# Items to export into callers namespace by default. Note: do not export
-# names by default without a very good reason. Use EXPORT_OK instead.
-# Do not simply export all your public functions/methods/constants.
+use Carp;
 
-# This allows declaration	use HTML::Strip ':all';
-# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
-# will save memory.
-our %EXPORT_TAGS = ( 'all' => [ qw(
-                                  ) ] );
+my $_html_entities_p = eval { require HTML::Entities; 1 };
 
-our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
+my %defaults = (
+    striptags => [qw( title
+                      style
+                      script
+                      applet )],
+    emit_spaces	    => 1,
+    decode_entities	=> 1,
+    filter          => $_html_entities_p ? 'filter_entities' : undef,
+    auto_reset      => 0,
+    debug           => 0,
+);
 
-our @EXPORT = qw();
+sub new {
+    my $class = shift;
+    my $obj = create();
+    bless $obj, $class;
+
+    my %args = (%defaults, @_);
+    while( my ($key, $value) = each %args ) {
+        my $method = "set_${key}";
+        if( $obj->can($method) ) {
+            $obj->$method($value);
+        } else {
+            Carp::carp "Invalid setting '$key'";
+        }
+    }
+    return $obj;
+}
 
-our $VERSION = '1.06';
+sub set_striptags {
+    my ($self, @tags) = @_;
+    if( ref($tags[0]) eq 'ARRAY' ) {
+        $self->set_striptags_ref( $tags[0] );
+    } else {
+        $self->set_striptags_ref( \@tags );
+    }
+}
 
-bootstrap HTML::Strip $VERSION;
+{
+    # an inside-out object approach
+    # for the 'filter' attribute
+    my %filter_of;
 
-# Preloaded methods go here.
+    sub set_filter {
+        my ($self, $filter) = @_;
+        $filter_of{0+$self} = $filter;
+    }
 
-my $_html_entities_p = eval 'require HTML::Entities';
+    sub filter {
+        my $self = shift;
+        return $filter_of{0+$self}
+    }
 
-my %defaults = (
-                striptags	=> [qw( title
-                                        style
-                                        script
-                                        applet )],
-                emit_spaces	=> 1,
-                decode_entities	=> 1,
-               );
+    sub DESTROY {
+        my $self = shift;
+        delete $filter_of{0+$self};
+    }
+}
 
-sub new {
-  my $class = shift;
-  my $obj = create();
-  bless $obj, $class;
-
-  my %args = (%defaults, @_);
-  while( my ($key, $value) = each %args ) {
-    my $method = "set_${key}";
-    if( $obj->can($method) ) {
-      $obj->$method($value);
-    } else {
-      carp "Invalid setting '$key'";
+# $decoded_string = $self->filter_entities( $string )
+sub filter_entities {
+    my $self = shift;
+    if( $self->decode_entities ) {
+        return HTML::Entities::decode($_[0]);
     }
-  }
-  return $obj;
+    return $_[0];
 }
 
-sub set_striptags {
-  my ($self, @tags) = @_;
-  if( ref($tags[0]) eq 'ARRAY' ) {
-    $self->set_striptags_ref( $tags[0] );
-  } else {
-    $self->set_striptags_ref( \@tags );
-  }
+sub _do_filter {
+    my $self = shift;
+    my $filter = $self->filter;
+    # no filter: return immediately
+    return $_[0] unless defined $filter;
+
+    if ( !ref $filter ) { # method name
+        return $self->$filter( @_ );
+    } else { # code ref
+        return $filter->( @_ );
+    }
 }
 
 sub parse {
-  my ($self, $text) = @_;
-  my $stripped = $self->strip_html( $text );
-  if( $self->decode_entities && $_html_entities_p ) {
-    $stripped = HTML::Entities::decode($stripped);
-  }
-  return $stripped;
+    my ($self, $text) = @_;
+    my $stripped = $self->strip_html( $text );
+    return $self->_do_filter( $stripped );
 }
 
 sub eof {
-  my $self = shift;
-  $self->reset();
+    my $self = shift;
+    $self->reset();
 }
 
 1;
@@ -136,7 +159,9 @@ If the tag starts with an exclamation mark, it is assumed to be a
 declaration or a comment. Within such tags, C<E<gt>> characters do not
 end the tag if they appear within pairs of double dashes (e.g. C<E<lt>!--
 E<lt>a href="old.htm"E<gt>old pageE<lt>/aE<gt> --E<gt>> would be
-stripped completely).
+stripped completely). Inside a comment, quotes are not parsed
+either. (That means C<E<lt>!-- comment with ' quote " --E<gt>>
+is entirely stripped.)
 
 =back
 
@@ -154,10 +179,15 @@ quote, comment, or whatever; it will remember this, and expect the
 next call to parse to start with the remains of said tag.
 
 If this is not going to be the case, be sure to call $hs->eof()
-between calls to $hs->parse().
+between calls to $hs->parse(). Alternatively, you may
+set C<auto_reset> to true in the constructor, or at any time
+afterwards with C<set_auto_reset>, so that the parser always
+operates on a one-shot basis (resetting after each parsed chunk).
 
 =head2 METHODS
 
+=over
+
 =item new()
 
 Constructor. Can optionally take a hash of settings (with keys
@@ -201,6 +231,29 @@ any conversion of tags into spaces. Set to true by default.
 Takes a boolean value. If set to false, HTML::Strip will not decode
 HTML entities. Set to true by default.
 
+=item filter_entities()
+
+If HTML::Entities is available, this method behaves just
+like invoking HTML::Entities::decode_entities, except that
+it respects the current setting of 'decode_entities'.
+
+=item set_filter()
+
+Sets a filter to be applied after tags have been stripped.
+It may accept the name of a method (like 'filter_entities')
+or a code ref. By default, its value is 'filter_entities'
+if HTML::Entities is available or C<undef> otherwise.
+
+=item set_auto_reset()
+
+Takes a boolean value. If set to true, C<parse> resets after
+each call (equivalent to calling C<eof>). Otherwise, the
+parser remembers its state from one call to C<parse> to
+another, until you call C<eof> explicitly. Set to false
+by default.
+
+=back
+
 =head2 LIMITATIONS
 
 =over 4
@@ -227,6 +280,8 @@ excess whitespace (for example, using C<tr/ / /s;>).
 HTML::Strip will only attempt decoding of HTML entities if
 L<HTML::Entities> is installed.
 
+=back
+
 =head2 EXPORT
 
 None by default.
@@ -5,7 +5,7 @@
 
 #include "strip_html.h"
 
-MODULE = HTML::Strip		PACKAGE = HTML::Strip		
+MODULE = HTML::Strip		PACKAGE = HTML::Strip
 
 PROTOTYPES: ENABLE
 
@@ -39,7 +39,7 @@ strip_html( stripper, raw )
   strip_html( stripper, raw, clean );
   RETVAL = clean;
  OUTPUT:
-  RETVAL  
+  RETVAL
  CLEANUP:
   Safefree( clean );
 
@@ -88,7 +88,7 @@ set_striptags_ref( stripper, tagref )
   int n;
   if( (SvROK(tagref)) &&
       (SvTYPE(SvRV(tagref)) == SVt_PVAV) ) {
-    tags = (AV *) SvRV(tagref);  
+    tags = (AV *) SvRV(tagref);
   } else {
     XSRETURN_UNDEF;
   }
@@ -103,3 +103,33 @@ set_striptags_ref( stripper, tagref )
      char * tag = SvPV(*av_fetch(tags, n, 0), l);
      add_striptag( stripper, tag );
   }
+
+void
+set_auto_reset( stripper, value )
+  Stripper * stripper
+  int value
+ CODE:
+  stripper->o_auto_reset = value;
+
+int
+auto_reset( stripper )
+  Stripper * stripper
+ CODE:
+  RETVAL = stripper->o_auto_reset;
+ OUTPUT:
+  RETVAL
+
+void
+set_debug( stripper, value )
+  Stripper * stripper
+  int value
+ CODE:
+  stripper->o_debug = value;
+
+int
+debug( stripper )
+  Stripper * stripper
+ CODE:
+  RETVAL = stripper->o_debug;
+ OUTPUT:
+  RETVAL
@@ -1,17 +1,33 @@
-
 #include <stdio.h>
 #include <ctype.h>
 #include <string.h>
 #include "strip_html.h"
 
-
 void
 strip_html( Stripper * stripper, const char * raw, char * output ) {
   const char * p_raw = raw;
   const char * raw_end = raw + strlen(raw);
   char * p_output = output;
-    
+
   while( p_raw < raw_end ) {
+    if( stripper->o_debug ) {
+      printf( "[DEBUG] char %c state %c %c %c tag:%5s, %c %c %c %c, %c %c %c %c:%c, ",
+        *p_raw,
+        (stripper->f_closing ? 'C' : ' '),
+        (stripper->f_in_tag ? 'T' : ' '),
+        (stripper->f_full_tagname ? 'F' : ' '),
+        stripper->tagname,
+        (stripper->f_just_seen_tag ? 'J' : ' '),
+        (stripper->f_outputted_space ? 'S' : ' '),
+        (stripper->f_lastchar_slash ? '/' : ' '),
+        (stripper->f_lastchar_minus ? '-' : ' '),
+        (stripper->f_in_decl ? 'D' : ' '),
+        (stripper->f_in_comment ? 'C' : ' '),
+        (stripper->f_in_striptag ? 'X' : ' '),
+        (stripper->f_in_quote ? 'Q' : ' '),
+        (stripper->quote ? stripper->quote : ' ')
+      );
+    }
     if( stripper->f_in_tag ) {
       /* inside a tag */
       /* check if we know either the tagname, or that we're in a declaration */
@@ -23,7 +39,8 @@ strip_html( Stripper * stripper, const char * raw, char * output ) {
         /* then check if the first character is a '/', in which case, this is a closing tag */
         else if( stripper->p_tagname == stripper->tagname && *p_raw == '/' ) {
           stripper->f_closing = 1;
-        } else {
+        /* we only care about closing tags within a stripped tags block (e.g. scripts) */
+        } else if( !stripper->f_in_striptag || stripper->f_closing ) {
           /* if we don't have the full tag name yet, add current character unless it's whitespace, a '/', or a '>';
              otherwise null pad the string and set the full tagname flag, and check the tagname against stripped ones.
              also sanity check we haven't reached the array bounds, and truncate the tagname here if we have */
@@ -41,7 +58,7 @@ strip_html( Stripper * stripper, const char * raw, char * output ) {
               /* if we're outside a stripped tag block, check tagname against stripped tag list */
             } else if( !stripper->f_in_striptag && !stripper->f_closing ) {
               int i;
-              for( i = 0; i <= stripper->numstriptags; i++ ) {
+              for( i = 0; i < stripper->numstriptags; i++ ) {
                 if( strcasecmp( stripper->tagname, stripper->o_striptags[i] ) == 0 ) {
                   stripper->f_in_striptag = 1;
                   strcpy( stripper->striptag, stripper->tagname );
@@ -61,8 +78,9 @@ strip_html( Stripper * stripper, const char * raw, char * output ) {
           }
         } else {
           /* not in a quote */
-          /* check for quote characters */
-          if( *p_raw == '\'' || *p_raw == '\"' ) {
+          /* check for quote characters, but not in a comment */
+          if( !stripper->f_in_comment &&
+              ( *p_raw == '\'' || *p_raw == '\"' ) ) {
             stripper->f_in_quote = 1;
             stripper->quote = *p_raw;
             /* reset lastchar_* flags in case we have something perverse like '-"' or '/"' */
@@ -112,30 +130,43 @@ strip_html( Stripper * stripper, const char * raw, char * output ) {
             /* output a space in place of tags we have previously parsed,
                and set a flag so we only do this once for every group of tags.
                done here to prevent unnecessary trailing spaces */
-            if( isspace(*p_raw) ) {
+            if( !isspace(*p_raw) &&
               /* don't output a space if this character is one anyway */
-              stripper->f_outputted_space = 1;
-            } else {
-              if( !stripper->f_outputted_space &&
-                  stripper->f_just_seen_tag ) {
-                *p_output++ = ' ';
-                stripper->f_outputted_space = 1;
-              } else {
-                /* this character must not be a space */
-                stripper->f_outputted_space = 0;
+                !stripper->f_outputted_space &&
+                stripper->f_just_seen_tag ) {
+              if( stripper->o_debug ) {
+                printf("SPACE ");
               }
+              *p_output++ = ' ';
+              stripper->f_outputted_space = 1;
             }
           }
+          if( stripper->o_debug ) {
+            printf("CHAR %c", *p_raw);
+          }
           *p_output++ = *p_raw;
           /* reset 'just seen tag' flag */
           stripper->f_just_seen_tag = 0;
+          /* reset 'outputted space' flag if character is not one */
+          if (!isspace(*p_raw)) {
+            stripper->f_outputted_space = 0;
+          } else {
+            stripper->f_outputted_space = 1;
+          }
         }
       }
     } /* in tag check */
     p_raw++;
+    if( stripper->o_debug ) {
+      printf("\n");
+    }
   } /* while loop */
 
   *p_output = 0;
+
+  if (stripper->o_auto_reset) {
+    reset( stripper );
+  }
 }
 
 void
@@ -147,14 +178,15 @@ reset( Stripper * stripper ) {
   /* hack to stop a space being output on strings starting with a tag */
   stripper->f_outputted_space = 1;
   stripper->f_just_seen_tag = 0;
-    
+
   stripper->f_in_quote = 0;
 
   stripper->f_in_decl = 0;
   stripper->f_in_comment = 0;
   stripper->f_lastchar_minus = 0;
-    
+
   stripper->f_in_striptag = 0;
+
 }
 
 void
@@ -172,6 +204,9 @@ add_striptag( Stripper * stripper, char * striptag ) {
   }
 }
 
+#ifdef _MSC_VER
+#define strcasecmp(a,b) stricmp(a,b)
+#endif
 
 void
 check_end( Stripper * stripper, char end ) {
@@ -27,6 +27,10 @@ typedef struct Stripper {
   int numstriptags;
   int o_emit_spaces;
   int o_decode_entities;
+
+  int o_auto_reset;
+
+  int o_debug;
 } Stripper;
 
 void strip_html( Stripper * stripper, const char * raw, char * clean );
@@ -0,0 +1,20 @@
+
+use Test::More tests => 5;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+{
+  my $hs = HTML::Strip->new; # auto_reset off by default
+  my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" );
+  is( $o, "\nTitle\n" );
+  my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" );
+  is( $o2, "\nEnd\n" );
+}
+
+{
+  my $hs = HTML::Strip->new( auto_reset => 1 ); # auto_reset on
+  my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" );
+  is( $o, "\nTitle\n" );
+  my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" );
+  is( $o2, "c+d\n\nEnd\n" );
+}
@@ -0,0 +1,67 @@
+use Test::More tests => 19;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+{
+  my $hs = new HTML::Strip;
+
+  is( $hs->parse( 'test' ), 'test', 'works with plain text' );
+  $hs->eof;
+
+  is( $hs->parse( '<em>test</em>' ), 'test', 'works with <em>|</em> tags' );
+  $hs->eof;
+
+  is( $hs->parse( 'foo<br>bar' ), 'foo bar', 'works with <br> tag' );
+  $hs->eof;
+
+  is( $hs->parse( '<p align="center">test</p>' ), 'test', 'works with tags with attributes' );
+  $hs->eof;
+
+  is( $hs->parse( '<p align="center>test</p>' ), '', '"works" with non-terminated quotes' );
+  $hs->eof;
+
+  is( $hs->parse( '<foo>bar' ), 'bar', 'strips <foo> tags' );
+  is( $hs->parse( '</foo>baz' ), ' baz', 'strips </foo> tags' );
+  $hs->eof;
+
+  is( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz', 'strip comments' );
+  $hs->eof;
+
+  is( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar', 'works with quote attributes which contain >' );
+  $hs->eof;
+
+  is( $hs->parse( '<script>if (a>b && a<c)</script>bar' ), 'bar', '<script> tag and content are stripped' );
+  $hs->eof;
+
+  is( $hs->parse( '<# just data #>bar' ), 'bar', 'weird tags get stripped' );
+  $hs->eof;
+
+  TODO: {
+    local $TODO = "fix CDATA handling";
+    is( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar', 'character data gets stripped' );
+    $hs->eof;
+  }
+
+  is( $hs->parse( '<script>foo</script>bar' ), 'bar', '<script> nodes are stripped' );
+  $hs->eof;
+
+  my $has_html_entities = eval { require HTML::Entities; 1 };
+  SKIP: {
+    skip 'HTML::Entities not available', 2 unless $has_html_entities;
+
+    is( $hs->parse( '&#060;foo&#062;' ), '<foo>', 'numeric HTML entities are decoded' );
+    $hs->eof;
+    is( $hs->parse( '&lt;foo&gt;' ), '<foo>', 'HTML entities are decoded' );
+    $hs->eof;
+  }
+
+  $hs->set_decode_entities(0);
+  is( $hs->parse( '&#060;foo&#062;' ), '&#060;foo&#062;', 'entities decoding off works' );
+  $hs->eof;
+
+  is( $hs->parse( '&lt;foo&gt;' ), '&lt;foo&gt;', 'entities decoding off works' );
+  $hs->eof;
+
+  is( $hs->parse( '<script>foo</script>bar' ), 'bar', '"script" is a default strip_tag' );
+  $hs->eof;
+}
@@ -0,0 +1,38 @@
+
+# http://rt.cpan.org/Public/Bug/Display.html?id=32355
+
+use Test::More tests => 7;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+# stripping declarations
+{
+  my $hs = HTML::Strip->new();
+  is( $hs->parse( q{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html>Text</html>} ),
+      "Text", 'decls are stripped' );
+  $hs->eof;
+}
+
+# stripping comments
+{
+  my $hs = HTML::Strip->new();
+  is( $hs->parse( q{<html><!-- a comment to be stripped -->Hello World!</html>} ),
+      "Hello World!", "comments are stripped" );
+  $hs->eof;
+
+  is( $hs->parse( q{<html><!-- comment with a ' apos -->Hello World!</html>} ), 
+     "Hello World!", q{comments may contain '} );
+  $hs->eof;
+
+  is( $hs->parse( q{<html><!-- comment with a " quote -->Hello World!</html>} ), 
+     "Hello World!", q{comments may contain "} );
+  $hs->eof;
+
+  is( $hs->parse( q{<html><!-- comment -- "quote" >Hello World!</html>} ), 
+     "Hello World!", "weird decls are stripped" );
+  $hs->eof;
+
+  is( $hs->parse( "a<>b" ),
+      "a b", 'edge case with <> ok' );
+
+}
@@ -0,0 +1,20 @@
+use Test::More;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+# test for RT#21008
+
+# stripping comments 
+{
+    my $hs = HTML::Strip->new();
+    is( $hs->parse( "a<>b" ), "a b", 'edge case with <> ok' );
+    $hs->eof;
+    is( $hs->parse( "a<>b c<>d" ), "a b c d", 'edge case with <>s ok' );
+    $hs->eof;
+    is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' ); 
+    is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' ); 
+    $hs->eof; 
+    is( $hs->parse( q{this is an "example" with 'quoted' parts that should not be stripped} ), q{this is an "example" with 'quoted' parts that should not be stripped} ); 
+}
+
+done_testing;
@@ -0,0 +1,18 @@
+use Test::More tests => 3;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+{
+  my $hs = HTML::Strip->new( filter => undef );
+  ok( $hs->parse( '<html>&nbsp;</html>' ), '&nbsp;' );
+  $hs->eof;
+
+}
+
+{
+  my $filter = sub { my $s = shift; $s =~ s/\s/ /g;; $s };
+  my $hs = HTML::Strip->new( filter => $filter );
+  ok( $hs->parse( "<html>title\ntext\ntext</html>" ), 'title text text' );
+  $hs->eof;
+
+}
@@ -0,0 +1,17 @@
+use Test::More tests => 2;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+# test for RT#35345
+{
+    my $hs = HTML::Strip->new();
+    is( $hs->parse( <<EOF ), "\nhello\n", "mathematical comparisons in strip tags big RT#35345" );
+<script>
+function shovelerMain (detectBuyBox) {
+    for (var i = 0; i < Shoveler.Instances.length; i++) {
+...
+</script>
+<h>hello</h>
+EOF
+    $hs->eof;
+}
@@ -0,0 +1,8 @@
+use strict;
+
+# test for RT#94713
+
+my $INC = join ' ', map { "-I$_" } @INC;
+
+exec("MALLOC_OPTIONS=Z perl $INC -MTest::More -MHTML::Strip -e 'is(HTML::Strip->new->parse(q[<li>abc < 0.5 km</li><li>xyz</li>]), q[abc xyz]); done_testing()'");
+
@@ -0,0 +1,31 @@
+use Test::More tests => 6;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+{
+  # set_striptags( \@ARRAY )
+  my $hs = HTML::Strip->new;
+  $hs->set_striptags( [ 'foo' ] );
+
+  is( $hs->parse( '<script>foo</script>bar' ), 'foo bar', 'set_striptags redefinition works' );
+  $hs->eof;
+
+  is( $hs->parse( '<foo>foo</foo>bar' ), 'bar', 'set_striptags redefinition works' );
+  $hs->eof;
+}
+
+{
+  # set_striptags( LIST )
+  my @striptags = qw(baz quux);
+  my $hs = HTML::Strip->new;
+  $hs->set_striptags( @striptags );
+
+  is( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar', 'stripping user-defined tags ok' );
+  $hs->eof;
+
+  is( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar', 'stripping user-defined tags ok' );
+  $hs->eof;
+
+  is( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), '   baz ', 'stripping user-defined tags ok' );
+  $hs->eof;
+}
@@ -0,0 +1,10 @@
+use Test::More tests => 2;
+
+BEGIN { use_ok 'HTML::Strip' }
+
+# test for RT#19036
+{
+    my $hs = HTML::Strip->new();
+    is( $hs->parse( '<tr><td>01 May 2006</td><td>0</td><td>10</td></tr>' ), '01 May 2006 0 10', "whitespace single character bug" );
+    $hs->eof;
+}
@@ -1,91 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-#########################
-
-# change 'tests => 1' to 'tests => last_test_to_print';
-
-use Test;
-BEGIN { plan tests => 17 };
-use HTML::Strip;
-ok(1); # If we made it this far, we're ok.
-
-#########################
-
-# Insert your test code below, the Test module is use()ed here so read
-# its man page ( perldoc Test ) for help writing this test script.
-
-my $hs = new HTML::Strip;
-
-ok( $hs->parse( 'test' ), 'test' );
-$hs->eof;
-
-ok( $hs->parse( '<em>test</em>' ), 'test' );
-$hs->eof;
-
-ok( $hs->parse( 'foo<br>bar' ), 'foo bar' );
-$hs->eof;
-
-ok( $hs->parse( '<p align="center">test</p>' ), 'test' );
-$hs->eof;
-
-ok( $hs->parse( '<p align="center>test</p>' ), '' );
-$hs->eof;
-
-ok( $hs->parse( '<foo>bar' ), 'bar' );
-ok( $hs->parse( '</foo>baz' ), ' baz' );
-$hs->eof;
-
-ok( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz' );
-$hs->eof;
-
-ok( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar' );
-$hs->eof;
-
-ok( $hs->parse( '<script>if (a<b && a>c)</script>bar' ), 'bar' );
-$hs->eof;
-
-ok( $hs->parse( '<# just data #>bar' ), 'bar' );
-$hs->eof;
-
-#ok( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar' );
-#$hs->eof;
-
-ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
-$hs->eof;
-
-my $html_entities_p = eval 'require HTML::Entities' ? '' : 'HTML::Entities not available';
-skip( $html_entities_p, $hs->parse( '&#060;foo&#062;' ), '<foo>' );
-$hs->eof;
-skip( $html_entities_p, $hs->parse( '&lt;foo&gt;' ), '<foo>' );
-$hs->eof;
-$hs->set_decode_entities(0);
-skip( $html_entities_p, $hs->parse( '&#060;foo&#062;' ), '&#060;foo&#062;' );
-$hs->eof;
-skip( $html_entities_p, $hs->parse( '&lt;foo&gt;' ), '&lt;foo&gt;' );
-$hs->eof;
-
-
-my $hs2 = new HTML::Strip;
-$hs2->set_striptags( [ 'foo' ] );
-
-ok( $hs2->parse( '<script>foo</script>bar' ), 'foo bar' );
-$hs2->eof;
-
-ok( $hs2->parse( '<foo>foo</foo>bar' ), 'bar' );
-$hs2->eof;
-
-ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
-$hs->eof;
-
-my @striptags = qw(baz quux);
-$hs->set_striptags( @striptags );
-
-ok( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar' );
-$hs->eof;
-
-ok( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar' );
-$hs->eof;
-
-ok( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), '   baz ' );
-$hs->eof;