The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -s

use strict;
use warnings;

use Lingua::NATools::Lexicon;
use Lingua::NATools::Dict;

my $MAXENTRY = 8;

our ($h);

sub usage {
  print "nat-addDict: adds a dictionary in Perl Dumper format into a NATools corpus\n\n";
  print "\tnat-addDict <natDir> <dumperFile>\n\n";
  print "\tnat-addDict <natDir> <source-target.dmp> <target-source.dmp>\n\n";
  print "For more help, please run 'perldoc nat-addDict'\n";
  exit;
}

usage() if ($h);
my $selfdir = shift @ARGV;
my $dumper  = shift @ARGV;
my $dumper2 = shift @ARGV || undef;

usage() unless -d $selfdir && -f $dumper;

print STDERR "step 1: load Data::Dumper file\n";

our ($DIC1, $DIC2);

if ($dumper2) {
  $DIC1 = do $dumper;
  $DIC2 = do $dumper2;
} else {
  do $dumper;
}
my $s1 = keys %$DIC1;
my $s2 = keys %$DIC2;
printf STDERR "        size 1: $s1\n";
printf STDERR "        size 2: $s2\n";




print STDERR "step 2: associate IDs to source Dumper entries\n";
die "Can't find source.lex file" unless -f "$selfdir/source.lex";

my $source = Lingua::NATools::Lexicon->new("$selfdir/source.lex");
my $new_source_dic_size = $source->size;
my $Oc1source =           $source->occurrences;
my $Oc2source = 0;
$source->enlarge($s1);
my $i = 0;
my $new = 0;
for my $w (keys %$DIC1) {
  ++$i;

  my $wid = ($w eq "(null)" || $w eq "(none)")?1:$source->id_from_word($w);

  # this increments the occurrence number to 1
  unless ($wid) {
    $wid = $source->add_word($w);
    $new++;
  }
  $Oc2source+=$DIC1->{$w}{count};
  $source->set_id_count($wid, $source->id_count($wid) + $DIC1->{$w}{count});
  $DIC1->{$w}{id} = $wid;
}
printf STDERR "        %d ids reused\n", $i - $new;
printf STDERR "        %d new ids\n", $new;
$s1 = $source->size();
$source->save("$selfdir/source.lex");
$source->close();

$new_source_dic_size+= $new;

print STDERR "step 3: associate IDs to target Dumper entries\n";
die "Can't find target.lex file" unless -f "$selfdir/target.lex";

my $target = Lingua::NATools::Lexicon->new("$selfdir/target.lex");
my $new_target_dic_size = $target->size;
my $Oc1target =           $target->occurrences;
my $Oc2target = 0;
$target->enlarge($s2);
$i = 0;
$new = 0;
for my $w (keys %$DIC2) {
  ++$i;

  my $wid = ($w eq "(null)" || $w eq "(none)")?1:$target->id_from_word($w);

  # this increments the occurrence number to 1
  unless ($wid) {
    $wid = $target->add_word($w);
    $new++;
  }
  $Oc2target+=$DIC2->{$w}{count};
  $target->set_id_count($wid, $target->id_count($wid) + $DIC2->{$w}{count});
  $DIC2->{$w}{id} = $wid;
}
printf STDERR "        %d ids reused\n", $i - $new;
printf STDERR "        %d new ids\n", $new;
$s2 = $target->size();
$target->save("$selfdir/target.lex");
$target->close();

$new_target_dic_size+= $new;



###  $Oc2source P(D1,wa,wb) wa1Occ + $oc1source P(D2,wa,wb) wa2occ
### ------------------------------------------------------------------------
###              wa1Occ $oc2source + wa2occ $oc1source
print STDERR "step 4: add source/target dictionary\n";
my $stDic = Lingua::NATools::Dict->open("$selfdir/source-target.bin");
die unless $stDic;

$stDic->enlarge($new_source_dic_size);


$target = Lingua::NATools::Lexicon->new("$selfdir/target.lex");
for my $w (keys %$DIC1) {
  my $wid = $DIC1->{$w}{id};

  my $wa1Occ = $stDic->occ($wid);
  my $wa2Occ = $DIC1->{$w}{count};

  my $ovals = $stDic->vals($wid);
  my %ovals = @$ovals;
  my @keys = keys %ovals;
  push @keys, grep { !exists($ovals{$_}) } map {$DIC2->{$_}{id}} keys %{$DIC1->{$w}{trans}};

  my %dic = ();
  for my $k (@keys) {
    my $P1 = $ovals{$k} || 0;
    my $tw = $target->word_from_id($k);
    my $P2 = $DIC1->{$w}{trans}{$tw} || 0;
    if ($wa1Occ + $wa2Occ == 0.0000000) {
      $dic{$k} = ($Oc2source * $P1 * ($Oc1source/100000) + $Oc1source * $P2 * ($Oc2source/100000)) /
	(($Oc1source/100000) * $Oc2source + ($Oc2source/100000) * $Oc1source);
    } else {
      $dic{$k} = ($Oc2source * $P1 * $wa1Occ + $Oc1source * $P2 * $wa2Occ) /
	($wa1Occ * $Oc2source + $wa2Occ * $Oc1source);
    }
  }

  my $index = 0;
  for my $k (sort {$dic{$b} <=> $dic{$a}} keys %dic) {
    last if $index >= $MAXENTRY;

    $stDic->set_val($wid, $index, $k, $dic{$k});

    $index++;
  }
  $stDic->set_occ($wid, $wa1Occ+$wa2Occ);
}
$stDic->save("$selfdir/source-target.bin");
$stDic->close();

$target->close();




$source = Lingua::NATools::Lexicon->new("$selfdir/source.lex");
print STDERR "step 5: add target/source dictionary\n";
my $tsDic = Lingua::NATools::Dict->open("$selfdir/target-source.bin");
die unless $tsDic;

$tsDic->enlarge($new_target_dic_size);

for my $w (keys %$DIC2) {
  my $wid = $DIC2->{$w}{id};

  my $wa1Occ = $tsDic->occ($wid);
  my $wa2Occ = $DIC2->{$w}{count};

  my $ovals = $tsDic->vals($wid);
  my %ovals = @$ovals;
  my @keys = keys %ovals;
  push @keys, grep { !exists($ovals{$_}) } map {$DIC1->{$_}{id}} keys %{$DIC2->{$w}{trans}};

  my %dic = ();
  for my $k (@keys) {
    my $P1 = $ovals{$k} || 0;
    my $tw = $source->word_from_id($k);
    my $P2 = $DIC2->{$w}{trans}{$tw} || 0;
    if ($wa1Occ + $wa2Occ == 0.0000000) {
      $dic{$k} = ($Oc2target * $P1 * ($Oc1target/100000) + $Oc1target * $P2 * ($Oc2target/100000)) /
	(($Oc1target/100000) * $Oc2target + ($Oc2target/100000) * $Oc1target);
    } else {
      $dic{$k} = ($Oc2target * $P1 * $wa1Occ + $Oc1target * $P2 * $wa2Occ) /
	($wa1Occ * $Oc2target + $wa2Occ * $Oc1target);
    }
  }

  my $index = 0;
  for my $k (sort {$dic{$b} <=> $dic{$a}} keys %dic) {
    last if $index >= $MAXENTRY;

    $tsDic->set_val($wid, $index, $k, $dic{$k});

    $index++;
  }
  $tsDic->set_occ($wid, $wa1Occ+$wa2Occ);

}
$tsDic->save("$selfdir/target-source.bin");
$tsDic->close();
$source->close();

print STDERR "** DONE **\n";

=encoding UTF-8

=head1 NAME

nat-addDict: adds a dictionary in Perl Dumper format into a NATools corpus.

=head1 SYNOPSIS

  nat-addDict <natDir> <dumperFile>

  nat-addDict <natDir> <source-target.dmp> <target-source.dmp>

=head1 DESCRIPTION

This command is used to add an external dictionary (in Perl Dumper
format) to a NATools corpus.

=head1 SEE ALSO

NATools documentation, perl(1)

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006-2009 by Alberto Manuel Brandão Simões

=cut