The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w -s
use Data::Dumper;
use analiza;
use strict;
use locale;
use Lingua::PT::PLN;
use Lingua::PT::PLN::tokenizer;
my $tab = $Lingua::PT::PLN::tokenizer::tab;
use jspell;

our ($nopn);

jspell_dict("port");

my %savit_p;
my $savit_n;
my %reap = ();
my %subs = ();
my $i=0;
my $r;
my $wp='\w+(?:-\w+)*';  #pattern for word
my %validas;

if($nopn){open(F,">output1") or die;
  while(<>){print F $_}
  close F;
}
else {tagPN("output1"); }

open( I, "output1") or die;
open( X, "|jspell -d port -W 0 -J -g -a -w \"'\"> output2") or die;

while(<I>) {
  for my $rule (@$tab) {
     s{(^|\s|\b|["'«»])($rule->[0])}{ $1 . sav("$2","lex($2,$rule->[1])") }ge
  }

  print X tokenize($_);
}

close I;
close X;



open(X,"output2") or die;
while(<X>){
  s{\&\s+(.*?) 0 :}{procDesc($1)}ge;
  s{\*\s+(.*?) 0 :}{$1	}g;
  s{#n(\d+)}{$subs{$1} || "???$1"}eg;
  print;
}
close X;



sub procDesc{
  my $p = shift ;
  my @as = feaWithNewFlags($p);
  if(@as) {
      "$p\t".join(",", map {any2lex($_,$_->{rad})} @as);
  } else {
    my @an = analiza::analiza($p);
    if (@an) {
      "$p\t".join(",", map {any2lex($_,$_->{rad})} @an);
    } else {
      "$p\tNOK";
    }
  }
}

sub tokenize{
  my $abrev = join '|', qw( srt?a? dra? [A-Z] etc exa? jr profs? arq av estr?
			  et al vol eng tv lgo pr Oliv ig mrs? min rep );
  my $protect = '
       \#n\d+
    | \w+\'\w+
    | \w+-\w+
    |  [\w_.-]+ \@ [\w_.-]+                      # emails
    |  \w+\.[ºª]                                 # ordinals
    |  <[^>]*>                                   # marcup XML SGML
    |  \d+(?:\.\d+)+                             # numbers
    |  \d+\:\d+                                  # the time
    |  ((https?|ftp|gopher)://|www)[\w_./~-]+    # urls
';
  my $conf = { rs => "\n" };
  my $text = shift;
  if (ref($text) eq "HASH") {
    $conf = { %$conf, %$text};
    $text = shift;
  }
  my $result = "";
  local $/ = ">";
  my %tag=();
  my ($a,$b);
  for ($text) {
    if(/<(\w+)(.*?)>/)
      { ($a, $b) = ($1,$2);
	if ($b =~ /=/ )  { $tag{'v'}{$a}++ }
	else             { $tag{'s'}{$a}++ }
      }
    s/<\?xml.*?\?>//s;
    s/(.)\n-\n/$1-/g;
    s/($protect)/savit($1)/xge;
    s/([\»\]])/$1 /g;
    s#([\«\[])# $1#g;
    s#\"# \" #g;
    s/(\s*\b\s*|\s+)/\n/g;
    s/\n+/\n/g;
    s/\n(\.?[ºª])\b/$1/g;
    while ( s#\b([0-9]+)\n([\,.])\n([0-9]+\n)#$1$2$3#g ){};
    s#\n($abrev)\n\.\n#\n$1\.\n#ig;
    s/\n*</\n</;
    $_ = loadit($_);
    s/(\s*\n)+$/\n/;
    s/^(\s*\n)+//;
    $result.=$_;
  }

  $result =~ s/\n/$conf->{rs}/g;

  $result;
}

sub savit{
  my $a=shift;
  $savit_p{++$savit_n}=$a ;
  " __MARCA__$savit_n "
}

sub loadit{
  my $a = shift;
  $a =~ s/ ?__MARCA__(\d+) ?/$savit_p{$1}/g;
  $savit_n = 0;
  $a;
}

sub tagPN{
 my $out = shift || "output1";
 forPN( {t => "double", out => $out } ,
   sub {my ($p, $contex)=@_;
      if($p =~ m/($wp)\b(\s*)(.*)/) {
        my ($a,$b,$c)=($1,$2,$3);
        if (vazia($a)){ if($c) { "$a$b". savPN($c, "lex($c, [CAT=np, unknown=guessed])") }
                        else   { $p } }
        else { savPN($p,"lex($p, [CAT=np,unknown=guessed])") }
      }
      else { $p }
   },
   sub {my ($p, $contex)=@_;
      if($p =~ m/($wp)\b(\s*)(.*)/) {
        my ($a,$b,$c)=($1,$2,$3);
        if (!possibleNP($a)){ 
            if($c) { "$a$b".savPN( $c ,"lex($c, [CAT=np,unknown=guessed])") }
            else   { $p } }
        else { savPN($p,"lex($p, [CAT=np,unknown=guessed])") }
      }
      else { $p }
   } 
 );
}

sub savPN{
 my ($k,$inf)=@_;
 my $r = "UNDEF";
 my @a ;
 if($k =~ /^($wp)/) {
     my $x = $1;
     @a = fea($x);
     my $fs = +{ onethat({CAT=>"np"},@a) };
     if(defined $fs->{CAT}) { 
        delete $fs->{rad};
	$fs->{unknown}="guessed" if length($k) != length($x);
        $r = any2lex( $fs, $k);  
     }
     else{
        $fs = +{ onethat({CAT=>"nc"},@a) };
        if(defined $fs->{CAT}) { 
	    $fs->{unknown}="guessed";
            delete $fs->{rad};
            $fs = { CAT=> "np",
                    (defined $fs->{G}) ? (G => $fs->{G}) : (),
                    (defined $fs->{N}) ? (N => $fs->{N}) : ()};
            $r=any2lex( $fs, $k) ;  }
        else { $r = any2lex($inf,$k) }
     } 
 }
 else {  $r=any2lex($inf,$k)}
 sav( $k,$r) 
}

sub any2lex{
  my $x = shift;
  my $o = shift||" ";
  my $aux= any2str($x,"compact");
  $aux =~ s/\{(.*?)\}/lex($o,[$1])/g;
  $aux
}

sub sav {
    my ($k,$inf)=@_;
    if (exists($reap{$k})) { "#n$reap{$k}" } 
 else                   { $subs{$i}="$k\t$inf";
                          $reap{$k}=$i;
                          $r="#n$i";
                          $i++;
                          $r }
}

sub possibleNP{
  my $w = shift; 
  !(rad($w)) or (ok({CAT=>"np"},fea($w))) }

sub vazia{
 my $w = shift;
 my $CATs= join(" ",map {$_->{CAT}||""} fea($w));
 return ( $CATs =~ m!\b(adv|p|art|con|cp)!)?1:0
}


sub feaWithNewFlags{
  my $w = shift;
  setmode("+flags");
  setmode("+af");
  setmode("+nm");
  my @fs=fea($w);
  #  print Dumper(\@fs);
  my @r =  map { validaCatFlag($_)? ($_): () } @fs;
}

sub validaCatFlag{
  my $f=shift;
  my $flag=$f->{flags};
  my @fs=fea($f->{rad});
  #  print Dumper(\@fs);
  my @comp = map { (defined($_->{CAT}) && $validas{$flag}{$_->{CAT}})
                   ? (1) : () } @fs ;
  scalar(@comp)
}

BEGIN{
%validas=( 
            'A' => { adj => 1 , a_nc => 1,},
            'd' => { adj => 1 , a_nc => 1,},
            'f' => { adj => 1 , a_nc => 1,},
            'j' => { adj => 1 , a_nc => 1,},
            'm' => { adj => 1 , a_nc => 1,},
            's' => { adj => 1 , a_nc => 1,},
            'U' => { adj => 1 , a_nc => 1,},
            'F' => { adj => 1 , a_nc => 1,},
            'I' => { adj => 1 , a_nc => 1,},
            'T' => { adj => 1 , a_nc => 1,},
            'U' => { adj => 1 , a_nc => 1,},
            'a' => { nc =>1, adj => 1 , a_nc => 1,},
            'h' => { nc =>1, adj => 1 , a_nc => 1,},
            'p' => { nc =>1, adj => 1 , a_nc => 1,},
            'i' => { nc =>1, a_nc => 1,},
            't' => { nc =>1, a_nc => 1,},
            'u' => { nc =>1, a_nc => 1,},
            'w' => { nc =>1, a_nc => 1,},
            'y' => { nc =>1, a_nc => 1,},
            'C' => { v => 1 },
            'c' => { v => 1 },
            'D' => { v => 1 },
            'M' => { v => 1 },
            'n' => { v => 1 },
            'o' => { v => 1 },
            'v' => { v => 1 },
            'L' => { v => 1 },
            'P' => { v => 1 },
            'G' => { nc => 1, adj => 1 , a_nc => 1,},
          );
}