#!/usr/bin/perl -w -s
use Lingua::PT::PLN;
use Data::Dumper;
use strict;

# Command-line switches, populated by perl's -s option (see the shebang line):
#   -noimg    also delete <img> tags before counting
#   -tag=X    defaults to "p"; not referenced anywhere else in this script
#   -latin1   read the input file through "recode -f html..latin1"
our ($noimg, $tag, $latin1);

# Tags treated as segment separators; every occurrence is counted verbatim.
my @breakby = qw(table tr td p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd
                 div blockquote hr address);
# Tags that are deleted (opening and closing form) while their content stays.
my @removtag = qw(body html font a b i tt small);
# Elements that are deleted together with everything up to their closing tag.
my @remov = qw(head meta);

$tag = "p" unless $tag;
push @removtag, "img" if $noimg;

# The patterns are compiled once.  All tag matching is case-insensitive so
# that <P> and <p> are treated alike.
my $re_strip_tag = do {
    my $alt = join '|', @removtag;
    qr{</?(?:$alt)\b[^>]*>}i;
};
my $re_strip_elem = do {
    my $alt = join '|', @remov;
    # /s lets "." span newlines; \1 requires the matching closing tag.
    qr{<($alt)\b[^>]*>.*?</\1>}is;
};
# An earlier variant of the separator merged runs of adjacent tags and the
# whitespace around them:  \s*(?:</?(?:...)\b[^>]*>\s*)+
my $re_separator = do {
    my $alt = join '|', @breakby;
    qr{</?(?:$alt)\b[^>]*>}i;
};
# Text tokens that are counted: digit runs, single punctuation characters
# from the listed set, and runs of dots.
my $re_token = qr{([0-9]+|[\@:;!?\%=+*\\\/]|\.+)};

# count_occurrences($html)
#   Deletes the @removtag tags and the @remov elements from $html, splits
#   what is left on the separator tags and counts
#     * every separator tag, keyed by its exact text, and
#     * every $re_token match found in the text between separators.
#   An undefined $html is treated as the empty string.
#   Returns the counts as a flat (key => count) list.
sub count_occurrences {
    my ($html) = @_;
    $html = '' unless defined $html;

    $html =~ s/$re_strip_tag//g;
    $html =~ s/$re_strip_elem//g;

    my %count;
    for my $piece (split /($re_separator)/, $html) {
        # The capturing split returns text and separators as separate
        # pieces.  The test must be case-insensitive like the split itself
        # (the flag travels with the qr// object); otherwise upper-case
        # tags would be scanned as text and their "/" and digits counted.
        if ($piece =~ /\A$re_separator\z/) {
            $count{$piece}++;
        }
        else {
            $count{$_}++ for $piece =~ /$re_token/g;
        }
    }
    return %count;
}

# format_counts(%count)
#   Returns one "'key' => count" entry per key, sorted by key and joined
#   with ",\n".  No trailing newline is appended.
sub format_counts {
    my (%count) = @_;
    return join(",\n", map { "'$_' => $count{$_}" } sort keys %count);
}

# main()
#   Slurps the input named on the command line (or STDIN) and prints the
#   occurrence table.  With -latin1 the first file argument is replaced by a
#   pipe from recode(1); dies with a usage message if no file was given.
sub main {
    if ($latin1) {
        my $file = $ARGV[0];
        defined $file or die "usage: $0 [-noimg] [-latin1] file\n";
        # The command is run by the shell, so single-quote the file name
        # (escaping embedded quotes) to keep spaces and metacharacters inert.
        (my $quoted = $file) =~ s/'/'\\''/g;
        $ARGV[0] = "recode -f html..latin1 < '$quoted' |";
    }

    my $html = do { local $/; <> };    # slurp mode, restored on scope exit
    print format_counts(count_occurrences($html));
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised without reading any input.
main() unless caller;
__END__
=head1 NAME
html2p - html to list of C<P>
=head1 SYNOPSIS
html2p [-noimg] [-latin1] file
=head1 DESCRIPTION
C<html2p> produces an HTML page in which each independent text segment is
marked with "<p>", after dividing the text into sentences.
It was designed to help in the process of aligning texts.
The C<recode> command must be installed in order to use the
C<-latin1> option.
=head1 AUTHOR
J.Joao Almeida, jj@di.uminho.pt
=head1 SEE ALSO
perl(1).
=cut