The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w -s
#
# Count the block-level HTML tags and the number/punctuation tokens found in
# an HTML document.
#
# Switches (parsed by perl's own -s option, see the shebang line):
#   -noimg    also strip <img> tags before counting
#   -latin1   filter the input file through "recode -f html..latin1" first
#   -tag=NAME accepted for compatibility (defaults to "p"); the counting
#             code below does not use it
#
# Input:  the first file named on the command line, or standard input.
# Output: one "'token' => count" entry per distinct token, sorted by token
#         and joined with ",\n" (no trailing newline is written).
use strict;
use warnings;

use Lingua::PT::PLN;
use Data::Dumper;

# -s stores switch values in package variables of main, so these cannot be
# lexicals.
our ($noimg, $tag, $latin1);

# Tags that delimit independent text segments; each occurrence is counted.
my @breakby = qw(table tr td p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd
                 div blockquote hr address);
# Tags that are deleted while the text they enclose is kept.
my @removtag = qw(body html font a b i tt small);
# Elements that are deleted together with everything they enclose.
my @remov = qw(head meta);

$tag = "p" if not $tag;
push @removtag, "img" if $noimg;

my $removtag_alt = join '|', @removtag;
my $remov_alt    = join '|', @remov;
my $breakby_alt  = join '|', @breakby;

# All three patterns are case-insensitive so that <P> and <p> are treated
# alike everywhere (the split and the classification below must agree).
my $patremovtag = qr{</?(?:$removtag_alt)\b[^>]*>}i;
# /s lets "." cross newlines; this replaces the slower "(.|\n)*?" form.
my $patremov    = qr{<($remov_alt)\b[^>]*>.*?</\1>}is;
my $patsep      = qr{</?(?:$breakby_alt)\b[^>]*>}i;

# Slurp the whole document.
my $html;
if ($latin1) {
    my $file = shift @ARGV;
    die "$0: -latin1 needs an input file\n" unless defined $file;

    # Feed the file to recode through its standard input and run recode
    # without a shell, so unusual file names cannot be interpreted as shell
    # syntax.
    open STDIN, '<', $file
        or die "$0: cannot open '$file': $!\n";
    open my $recode, '-|', 'recode', '-f', 'html..latin1'
        or die "$0: cannot run recode: $!\n";
    $html = do { local $/; <$recode> };
    close $recode
        or warn $! ? "$0: error closing recode pipe: $!\n"
                   : "$0: recode exited with status " . ($? >> 8) . "\n";
}
else {
    $html = do { local $/; <> };
}
# An unreadable or missing input is treated as an empty document.
$html = '' unless defined $html;

# Order matters: simple tags are stripped first, then whole elements.
$html =~ s/$patremovtag//g;
$html =~ s/$patremov//g;

# Because the separator is captured, split returns text pieces and separator
# tags interleaved. A piece that is entirely one separator tag is counted as
# that tag; any other piece is scanned for numbers and punctuation tokens.
my %oco;
for my $piece (split /($patsep)/, $html) {
    if ($piece =~ /\A$patsep\z/) {
        $oco{$piece}++;
    }
    else {
        $oco{$_}++ for $piece =~ m{([0-9]+|[\@:;!?\%=+*\\\/]|\.+)}g;
    }
}

print join(",\n", map { "'$_' => $oco{$_}" } sort keys %oco);

__END__

=head1 NAME

html2p - html to list of C<P>

=head1 SYNOPSIS

  html2p [-noimg] [-latin1] file

=head1 DESCRIPTION

C<html2p> makes an HTML page in which the independent text segments are marked
with "<p>", after dividing the text into sentences.

It was designed to help in the process of aligning texts.

The C<recode> command must be installed in order to use the
C<-latin1> option.

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut