The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#----------------------------------------------------------------------------
# -*-perl-*-
#
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# $Id: giza2uplug,v 1.1.1.1 2004/05/03 15:39:55 joerg72 Exp $
#----------------------------------------------------------------------------
#
# 

use strict;
use FindBin qw($Bin);
use lib "$Bin/..";

use Uplug::Data::Align;
use Uplug::Data;
use Uplug::IO::Any;

my %InputStream=('format' => 'xces xml');
$InputStream{file}=shift(@ARGV);
my %OutputStream=('format' => 'xces xml',
		  'SkipSrcFile' => 1,
		  'SkipTrgFile' => 1);

my $giza=shift(@ARGV);

my $TokenLabel='w';

#---------------------------------------------------------------------------

my $input=Uplug::IO::Any->new(\%InputStream);
my $output=Uplug::IO::Any->new(\%OutputStream);
$input->open('read',\%InputStream);
$output->addheader($input->header());
$output->open('write',\%OutputStream);

open F,"<$giza";

#---------------------------------------------------------------------------

my $data=Uplug::Data::Align->new();

while ($input->read($data)){

    my $srctxt=$data->content('source');
    my $trgtxt=$data->content('target');
    $srctxt=~tr/\n/ /;
    $trgtxt=~tr/\n/ /;
    if ($srctxt!~/\S/){next;}
    if ($trgtxt!~/\S/){next;}

    my $SrcData=Uplug::Data->new();
    my $TrgData=Uplug::Data->new();
    $data->subTree($SrcData,'source');
    $data->subTree($TrgData,'target');

    my @SrcNodes=$SrcData->getNodes($TokenLabel);
    my @SrcIds=$data->attribute(\@SrcNodes,'id');
    my @SrcSpans=$data->attribute(\@SrcNodes,'span');
    my @SrcTokens=$data->content(\@SrcNodes);

    my @TrgNodes=$TrgData->getNodes($TokenLabel);
    my @TrgIds=$data->attribute(\@TrgNodes,'id');
    my @TrgSpans=$data->attribute(\@TrgNodes,'span');
    my @TrgTokens=$data->content(\@TrgNodes);

    if ((not @SrcNodes) or (not @TrgNodes)){next;}

    $_=<F>;
    $_=<F>;
    chomp;
    my @src=split(/ /);
    $_=<F>;
    chomp;


    my %align=();
    my $count=1;
    my $ggg=$_;
    while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){     # strunta i NULL!!
	if ($2){push (@{$align{$2}},$count);}
	$count++;
    }
    foreach (sort keys %align){
	my @s=@{$align{$_}};
	my @t=split(/\s/);

	my @src=();my @trg=();
	foreach (@s){push (@src,$SrcTokens[$_-1]);}
	foreach (@t){push (@trg,$TrgTokens[$_-1]);}
	my @srcId=();my @trgId=();
	foreach (@s){push (@srcId,$SrcIds[$_-1]);}
	foreach (@t){push (@trgId,$TrgIds[$_-1]);}
	my @srcSpan=();my @trgSpan=();
	foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);}
	foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);}

	my %link=();
	$link{link}=join ' ',@src;
	$link{link}.=';';
	$link{link}.=join ' ',@trg;
	$link{source}=join '+',@srcId;
	$link{target}=join '+',@trgId;
	$link{src}=join '&',@srcSpan;
	$link{trg}=join '&',@trgSpan;

	if ($link{source}!~/\S/){
	    print '';
	}

	$data->addWordLink(\%link);
    }

    $output->write($data);
}

$input->close;
$output->close;