The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#-*-perl-*-
#
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# $Id: xces2poslink,v 1.1 2007/05/08 08:52:36 joerg72 Exp $
#----------------------------------------------------------------------------
# extract features from word alignments (default: text:POS)
# USAGE: xces2poslink wordalign-file
# 

use strict;
use FindBin qw($Bin);
use lib "$Bin/..";

use Uplug::Data::Align;
use Uplug::IO::Any;

my %InputStream=('format' => 'xces align');
$InputStream{file}=shift(@ARGV);

#---------------------------------------------------------------------------

my $input=Uplug::IO::Any->new(\%InputStream);
$input->open('read',\%InputStream);


## specify your features here:
my %SrcParam=('features' => {'#text' => undef,     # text 
			     'pos' => undef});     # pos attribute
my %TrgParam=('features' => {'#text' => undef,     # the same for the
			     'pos' => undef});     # target language

#---------------------------------------------------------------------------


my $data=Uplug::Data::Align->new();

my $count=0;
while ($input->read($data)){
    $count++;
    if (not ($count % 100)){
	$|=1;print STDERR '.';$|=0;
    }
    if (not ($count % 1000)){
	$|=1;print STDERR "$count\n";$|=0;
    }

    ## 'link' subtree in XML
    my $link=$data->linkData();
    ## all word alignment nodes (wordLink tags)
    my @nodes=$link->findNodes('wordLink');


    foreach my $n (0..$#nodes){

	## get xtarget attribute
	my $xtrg=$link->attribute($ids[$n],'xtargets');
	my ($srcID,$trgID)=split(/\;/,$xtrg);

	## get word nodes (source and target) corresponding to the current link
	my @srcPhrNodes=&GetPhraseNodes($data->{source},$srcID);
	my @trgPhrNodes=&GetPhraseNodes($data->{target},$trgID);

	## get features according to features specifications
	my $SrcFeat=$data->getSrcPhraseFeature(\@srcPhrNodes,\%SrcParam);
	my $TrgFeat=$data->getTrgPhraseFeature(\@trgPhrNodes,\%TrgParam);

	## print source and target features
	print $SrcFeat,"\t",$TrgFeat,"\n";
    }
}
$input->close;







sub GetPhraseNodes{
    my $data=shift;
    my $idStr=shift;
    my @ids=split(/[\:\+]/,$idStr);
    my @nodes=();
    foreach (@ids){
	my ($node)=$data->findNodes('.*',{id => $_});
	if (ref($node)){
	    push (@nodes,$node);
	}
    }
    if ((not @nodes) and (@ids)){        # in case the nodes haven't been found
	my @n=$data->contentElements();  # (e.g. no IDs in the XML-file)
	foreach (@ids){
	    push (@nodes,$n[$_]);
	}
    }
    return @nodes;
}