#!/usr/bin/perl -s

use strict;
use warnings;
use File::Spec::Functions;
use Lingua::NATools::ConfigData;

# Silence "used only once" warnings for this file.
no warnings 'once';

# The 'libdir' entry of the NATools build configuration.
my $prefix = Lingua::NATools::ConfigData->config('libdir');

# The "NATools" entry below that directory.
# NOTE(review): nothing else in the visible script reads $rules_prefix or
# $prefix -- presumably kept for locating rule files; confirm before removing.
my $rules_prefix = catfile $prefix => "NATools";

# Command-line switches.  They are package variables because they are filled
# in by perl's own -s switch parsing (see the -s on the shebang line).
our ($id, $h, $tokenize, $full, $i, $ngrams, $terminology, $utf8, $csize, $langs);

# -full is shorthand for asking for both -terminology and -ngrams.
# Written as two plain assignments: the former one-liner
# "$terminology = 1 && $ngrams = 1" only worked because "&&" binds tighter
# than "=" and perl folds the constant "1 && $ngrams" into an lvalue.
if ($full) {
    $terminology = 1;
    $ngrams      = 1;
}

# Print a short command-line synopsis on standard output, then terminate
# the script.  Takes no arguments and never returns.
sub usage {
    my @synopsis = (
        "nat-mkMakefile: generates a pmakefile for Makefile::Parallel use.\n\n",
        "\tnat-mkMakefile -i -utf8 -tokenize -csize=70000 -id=<ID> <sourceCrp> <targetCrp> > pmakefile\n\n",
        "\tnat-mkMakefile -i -utf8 -tokenize -csize=70000 -id=<ID> <tmxfile> > pmakefile\n\n",
        "For more help, please run 'perldoc nat-mkMakefile'\n",
    );

    print $_ for @synopsis;
    exit;
}

# -h only asks for the synopsis.
usage() if $h;

# Asking for -utf8 also switches on -i.
$i = 1 if $utf8;

my $home = $ENV{HOME};

# Fall back to the default corpus identifier when -id was not given.
$id = "ALIGNED" unless $id;

# Work out the corpus arguments handed to nat-codify: either one TMX file
# (name ending in "tmx" or "tmxt") or a source/target pair of corpus files.
# Any other argument list is a usage error.
my $source   = "";
my $argcount = scalar @ARGV;
if ($argcount == 1 && $ARGV[0] =~ m!tmxt?$!) {
    $source = "-tmx $ARGV[0]";
}
elsif ($argcount == 2) {
    $source = "$ARGV[0] $ARGV[1]";
}
else {
    usage();
}

if ($langs) {
    $source = "-langs=$langs $source";
}

# Start from the template stored after __DATA__, then append the optional
# terminology and n-gram sections.
my $makefile = join "", <DATA>;
if ($terminology) {
    $makefile .= "\n" . _examples();
}
if ($ngrams) {
    $makefile .= "\n" . _ngrams() . _code();
}

# Turn each switch into the literal option text that goes into the template
# (an empty string when the switch was not given).
$tokenize = ($tokenize && "-tokenize")     || "";
$i        = ($i        && "-i")            || "";
$utf8     = ($utf8     && "-utf8")         || "";
$csize    = ($csize    && "-csize=$csize") || "";

# Fill in the <=NAME=> placeholders, one after another in this fixed order.
my @fillers = (
    [ ID       => $id ],
    [ SOURCE   => $source ],
    [ HOME     => $home ],
    [ TOKENIZE => $tokenize ],
    [ I        => $i ],
    [ UTF8     => $utf8 ],
    [ CSIZE    => $csize ],
);
for my $filler (@fillers) {
    my ($tag, $text) = @$filler;
    $makefile =~ s/<=${tag}=>/$text/g;
}

print $makefile;


# Returns the pmakefile rules for the optional examples-extraction step.
# The text is not interpolated here: "$i", "$ID" and "@i" are left for the
# pmakefile, and <=HOME=> is still a placeholder when this sub returns.
# NOTE(review): the language pair is hard-coded as "pt..en"; the script's
# -langs switch does not reach this rule -- confirm that is intended.
sub _examples {
    # One extraction job per chunk, started once both dictionaries exist.
    my $extract_rule = <<'EXTRACT';
examples$i: dicA dicB (20:00:00)
	<=HOME=>/NATools/NAT/scripts/nat-examplesExtractor -chunk=$i -local=$ID -langs=pt..en
EXTRACT

    # Merge the per-chunk outputs: lines containing '=!' go to patterns.txt,
    # the rest to examples.txt; the per-chunk files are then removed.
    my $merge_rule = <<'MERGE';
cleanExamples: examples$i (20:00:00)
	rm -f $ID/patterns.txt $ID/examples.txt
	for a in @i; do cat $ID/examples.${a}.txt | grep '=!' >> $ID/patterns.txt; done
	for a in @i; do cat $ID/examples.${a}.txt | grep -v '=!' >> $ID/examples.txt; done
	for a in @i; do rm -f $ID/examples.${a}.txt; done
MERGE

    # The two rules are separated by exactly one blank line.
    return join "\n", $extract_rule, $merge_rule;
}

# Returns the pmakefile rules for the optional n-gram step.  Each stage has
# one rule for the source side (S) and one for the target side (T).  The text
# is returned verbatim: "$i", "$j" and "$ID" are left for the pmakefile.
sub _ngrams {
    # Stage 1: count n-grams of size $j in every corpus chunk $i.
    my $count_stage = <<'COUNT';
ngramS$i.$j: codify (20:00:00)
	nat-ngrams -n $j $ID/source.$i.crp $ID/ngramS.$i.$j.db

ngramT$i.$j: codify (20:00:00)
	nat-ngrams -n $j $ID/target.$i.crp $ID/ngramT.$i.$j.db
COUNT

    # Stage 2: join the per-chunk databases of each size, then delete them.
    my $join_stage = <<'JOIN';
joinGramS$j: ngramS$i.$j (20:00:00)
	nat-ngrams -j $ID/ngramS$j $ID/ngramS*.$j.db
	rm -f $ID/ngramS*.$j.db

joinGramT$j: ngramT$i.$j (20:00:00)
	nat-ngrams -j $ID/ngramT$j $ID/ngramT*.$j.db
	rm -f $ID/ngramT*.$j.db
JOIN

    # Stage 3: call create_sqlite on each joined result.
    my $sqlite_stage = <<'SQLITE';
gramSQLiteS$j: joinGramS$j (20:00:00)
	sub{ create_sqlite('S', $j); }

gramSQLiteT$j: joinGramT$j (20:00:00)
	sub{ create_sqlite('T', $j); }
SQLITE

    # Stages are separated by a single blank line.
    return join "\n", $count_stage, $join_stage, $sqlite_stage;
}

# Returns the trailing section of the pmakefile: a "%%" separator line
# followed by the Perl source of create_sqlite, the helper called by the
# gramSQLite rules.  The helper is returned as text, not executed here.
sub _code {
    # A blank line, the "%%" marker and another blank line precede the code.
    my $separator = "\n%%\n\n";

    my $helper_source = <<'PERL';
sub create_sqlite {
	my ($t,$n) = @_;
	
	my $lex = ($t eq 'S')?"source":"target";
	
	`nat-ngrams -o 2 -d $ID/ngram$t$n $ID/$lex.lex > $ID/ngram$t$n.txt`;
	`rm -f $ID/ngram$t$n`;
	`sort -n -r $ID/ngram$t$n.txt > $ID/_$t$n`;
	`rm -f $ID/ngram$t$n.txt`;
	open R, "$ID/_$t$n";
	open W, ">$ID/__$t$n";
	while(<R>) {
		chomp;
		my @F = split /\s/, $_;
		push @F, shift @F;
		print W "@F\n";
	}
	close W;
	close R;
	`rm -f $ID/_$t$n`;
	
	my @v=(undef, undef, qw/bigrams trigrams tetragrams/);
	open SQL, "|sqlite3 $ID/$t.$n.sqlite";
	my $fields = join(",",map{"w$_"}(1..$n));
	print SQL "CREATE TABLE $v[$n] ($fields,occs);\n";
	print SQL ".separator ' '\n";
	print SQL ".import $ID/__$t$n $v[$n]\n";
	for my $i (1..$n) {
		print SQL "CREATE INDEX idx${t}${n}w${i} ON $v[$n] (w$i);"
	}
	close SQL;
	`rm -f $ID/__$t$n`;
	

}
PERL

    return $separator . $helper_source;
}


=encoding UTF-8

=head1 NAME

nat-mkMakefile - generates a pmakefile to be used by Makefile::Parallel

=head1 SYNOPSIS

  nat-mkMakefile -id=<ID> <sourceCrp> <targetCrp>   >   pmakefile
  nat-mkMakefile -id=<ID> <tmxfile>                 >   pmakefile

=head1 DESCRIPTION

This script generates a parallel makefile to be used by
Makefile::Parallel to align and extract examples using a PBS based
cluster.

The C<-id> switch sets the identifier of the corpus to be created. When
it is omitted, the identifier defaults to C<ALIGNED>.

=head1 OPTIONS

=over 4

=item C<-full>

Creates the full makefile, including n-grams and terminology
extraction.

=item C<-ngrams>

Creates the makefile including n-grams computation.

=item C<-terminology>

Creates the makefile including terminology extraction.

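=item C<-id=ID>

Identifier of the corpus to be created. Defaults to C<ALIGNED>.

=item C<-utf8>

Passes C<-utf8> to C<nat-codify> and C<nat-dumpDicts>. Implies C<-i>.

=item C<-i>, C<-tokenize>, C<-csize=N>, C<-langs=LANGS>

Passed through to C<nat-codify>.

=item C<-h>

Prints a short usage message and exits.

=back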

=head1 SEE ALSO

NATools documentation, Makefile::Parallel, perl(1)

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006-2012 by Alberto Manuel Brandão Simões

=cut

__DATA__
# -*- makefile -*-
ID=<=ID=>

codify: (20:00:00)
	nat-codify -v <=UTF8=> <=I=> -id=$ID <=CSIZE=> <=TOKENIZE=> <=SOURCE=>
	i <- sub{ $nr = `cat <=ID=>/nat.cnf |grep nr-chunks|cut -f 2 -d "="`; printf("%03d\n",$_) for (1..$nr); }
	j <- sub{ printf("%d\n", $_) for (2..4) }

initmat$i: codify  (20:00:00)
	nat-initmat $ID/source.$i.crp $ID/target.$i.crp $ID/mat.$i.in

ipfp$i: initmat$i (20:00:00)
	nat-ipfp 5 $ID/source.$i.crp $ID/target.$i.crp $ID/mat.$i.in $ID/mat.$i.out
	rm -f $ID/mat.$i.in

postipfp$i: ipfp$i (20:00:00)
	nat-mat2dic $ID/mat.$i.out $ID/dict.$i
	rm -f $ID/mat.$i.out

postbin$i: postipfp$i (20:00:00)
	nat-postbin $ID/dict.$i $ID/source.$i.crp.partials $ID/target.$i.crp.partials $ID/source.lex $ID/target.lex $ID/source-target.$i.bin $ID/target-source.$i.bin
	rm -f $ID/dict.$i

dicA: postbin$i (20:00:00)
	for a in @i; do nat-dict add $ID/source-target.bin $ID/source-target.${a}.bin; done
	for a in @i; do rm -f $ID/source-target.${a}.bin; done

dicB: postbin$i (20:00:00)
	for a in @i; do nat-dict add $ID/target-source.bin $ID/target-source.${a}.bin; done
	for a in @i; do rm -f $ID/target-source.${a}.bin; done

dump: dicA dicB (20:00:00)
	nat-dumpDicts <=UTF8=> -self $ID