The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#-*-perl-*-
#
#

use FindBin;
use lib $FindBin::Bin.'/../lib';
use Lingua::Align::Corpus::Parallel::OPUS;


my $AlignFile = shift(@ARGV);
my $SrcTrees = shift(@ARGV);
my $SrcFormat = shift(@ARGV);

my $TrgTrees = shift(@ARGV);
my $TrgFormat = shift(@ARGV);

my $SrcOut = shift(@ARGV);
my $TrgOut = shift(@ARGV);

my $corpus = new Lingua::Align::Corpus::Parallel::OPUS(
	      -alignfile => $AlignFile,
	      -type => 'opus',
	      -src_file => $SrcTrees,
	      -src_type => $SrcFormat,
	      -trg_file => $TrgTrees,
	      -trg_type => $TrgFormat);

open S,">$SrcOut" || die "cannot write to $SrcOut!\n";
open T,">$TrgOut" || die "cannot write to $TrgOut!\n";

my %src=();
my %trg=();
my $links;

while ($corpus->next_alignment(\%src,\%trg,\$links)){
    my @srcwords=$corpus->{SRC}->get_all_leafs(\%src);
    my @trgwords=$corpus->{TRG}->get_all_leafs(\%trg);
    if (@srcwords && @trgwords){
	print S join(' ',@srcwords);
	print S "\n";
	print T join(' ',@trgwords);
	print T "\n";
    }
}

close S;
close T;


__END__

=head1 NAME

treebank2moses - convert treebanks to Moses/GIZA++ format (plain text)

=head1 SYNOPSIS

    treebank2moses alignfile STrees SFormat TTrees TFormat SOut TOut

=head1 DESCRIPTION


=head1 SEE ALSO

L<Lingua::Align::Corpus>
 

=head1 AUTHOR

Joerg Tiedemann

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.


=cut