The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -s

# NATools - Package with parallel corpora tools
# Copyright (C) 2002-2012  Alberto Simões
#
# This package is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.


use strict;
use warnings;

use POSIX qw(locale_h);
setlocale(LC_CTYPE, "pt_PT");
use locale;


use Lingua::NATools;
use Lingua::NATools::Lexicon;
use Lingua::NATools::Config;
use Cwd;

our ($tmx, $langs, $tokenize, $utf8, $id, $h, $i, $v, $csize);

# Print the short command-line synopsis on STDOUT and terminate the program.
# Both invocation forms (two corpus files, or a single TMX file) accept the
# same set of switches, so the switch list is spelled out only once.
sub usage {
    my $switches = '[-v] [-utf8] [-id=ID] [-csize=70000] [-tokenize] [-i] [-langs=L1..L2]';

    print "nat-codify: codifies a corpus. Normally not used manually.\n\n"
        . "\tnat-codify $switches\n\t\t <sourceCrp> <targetCrp>\n\n"
        . "\tnat-codify $switches\n\t\t -tmx <tmxfile>\n\n"
        . "For more help, please run 'perldoc nat-codify'\n";

    exit;
}

# ---------------------------------------------------------------------------
# Main script body.
#
# Switches arrive through perl's -s option (see the shebang line), which
# populates the package variables declared with `our` above.
#
# Flow:
#   1. validate the command line;
#   2. with -tmx, split the TMX file into one temporary file per language;
#   3. initialise a NATools corpus in the current directory and codify the
#      two corpus files into it;
#   4. remove the temporary files created in step 2.
# ---------------------------------------------------------------------------

# Verbosity defaults to 0 when -v was not given.
$v ||= 0;

usage() if $h;

my @files = @ARGV;

# A TMX run needs one input file; a plain run needs both corpus files.
# (Checking only for "at least one argument" would let a plain run continue
# with an undefined second corpus.)
usage() unless @files >= ($tmx ? 1 : 2);

# Parse -langs=L1..L2 once. The list stays empty when the switch is absent
# or does not have the expected "L1..L2" shape.
my @forced_langs = ();
@forced_langs = ($1, $2)
    if $langs && $langs =~ m!([a-z-]+)\.\.([a-z-]+)!i;

my @langs     = ();
my @tmp_files = ();    # files created by tmx2files, to be removed at the end

if ($tmx) {
    # -tmx is a bare switch; the TMX file name is the first argument.
    $tmx = $files[0];
    print STDERR "Loading TMX file.\n";
    @files = Lingua::NATools::tmx2files({utf8 => $utf8, verbose => $v}, $tmx);
    die "Error loading TMX file\n" unless @files;

    # Remember everything that was extracted, so that cleanup still removes
    # all of it even when -langs narrows @files down to two entries.
    @tmp_files = @files;

    # The language tag is whatever follows "<tmxfile>-" in each file name.
    # Only push on a successful match: $1 would otherwise be stale.
    for my $file (@files) {
        push @langs, $1 if $file =~ m!\Q$tmx\E-(.+)!;
    }

    # Explicit -langs selects which two extracted files get codified.
    @files = map { "$tmx-$_" } @forced_langs if @forced_langs;
}

### Check if we have specific languages defined.
@langs = @forced_langs if @forced_langs;

### Define file names
my ($crp1, $crp2) = @files[0..1];

# Only reachable in TMX mode (plain mode was validated above): the TMX file
# yielded fewer than two per-language files.
unless (defined $crp1 && defined $crp2) {
    unlink @tmp_files if @tmp_files;
    die "Error: two corpus files are needed, but the TMX file produced fewer\n";
}

my $defaultname = $id || "$crp1-$crp2";
my $currentdir  = getcwd;

my $self = Lingua::NATools->init({csize => $csize || undef},
                                 $currentdir, $defaultname, @langs);

# -utf8 overrides -i: ignore_case is forced off for UTF-8 input.
# NOTE(review): presumably because case folding here relies on the pt_PT
# locale set at the top of the file -- confirm.
$i = 0 if $utf8;
$self->{tokenize} = 1 if $tokenize;
$self->codify({
               ignore_case => $i,
               utf8        => $utf8,
               verbose     => $v,
              },
              $crp1, $crp2);
$self->index_invindexes(1);

# Remove the per-language files extracted from the TMX (no-op in plain mode,
# where @tmp_files is empty and the user's own corpus files are left alone).
unlink @tmp_files if @tmp_files;




__END__

=encoding UTF-8

=head1 NAME

nat-codify - Command line tool to codify corpora

=head1 SYNOPSIS

   nat-codify <file1.nat> <file2.nat>

   nat-codify -tmx <file.tmx>

=head1 DESCRIPTION

The C<-tokenize> flag can be used to force NATools to tokenize the
texts. Note that at the moment a Portuguese tokenizer is used for all
languages. This might change in the future.

The C<-id=name> flag can be used to force the NATools corpus name. By
default the name is built from the names of the two corpus files.

The C<-q> flag can be used to force quiet mode.
In this case, the name is extracted from the file names.

The C<-langs=PT..EN> flag can be used to force languages.

=head1 SEE ALSO

NATools documentation, perl(1), nat-create

=head1 AUTHOR

Alberto Manuel Brandão Simões, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2002-2012 by Alberto Manuel Brandão Simões

=cut