The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
# -*-perl-*-
# 
# linkclue.pl
#
#---------------------------------------------------------------------------
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
# $Id$
#
# usage:  
#

BEGIN{
    use strict;
    use FindBin qw($Bin);
    use lib "$Bin/../lib";
    $ENV{UPLUGHOME}="$Bin/../" unless (defined $ENV{UPLUGHOME});
}

use Uplug::Data::Align;
use Uplug::Data;
# use Uplug::Data::DOM;
use Uplug::IO::Any;
use Uplug::Config;

my %IniData=&GetDefaultIni;
my $IniFile='linkclue.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------

my ($InputStreamName,$InputStream)=           # take only 
    each %{$IniData{'input'}};                # the first input stream
my $OutDbm=$IniData{output};

if (not $InputStreamName){die "# LinkClue: no input stream!\n";}
if (ref($InputStream) ne 'HASH'){die "# LinkClue: no input stream!\n";}
if (ref($OutDbm) ne 'HASH'){die "# LinkClue: cannot find output streams";}

#---------------------------------------------------------------------------
# open input data stream

my $input=Uplug::IO::Any->new($InputStream);
if (not $input->open('read',$InputStream)){exit;}
my $header=$input->header;

#---------------------------------------------------------------------------
# open clue dbm's and set feature parameter

my $param=$IniData{parameter};
my %output;
my %feature=();
foreach (keys %{$OutDbm}){
    if (ref($$param{clues}) eq 'HASH'){
	if (not $$param{clues}{$_}){next;}
    }
    if (defined $IniData{parameter}{$_}){
	$feature{$_}=$IniData{parameter}{$_};
    }
    else{
	$feature{$_}=$IniData{parameter}{token};
    }
    if (not defined $feature{$_}{'pair frequency'}){ 
	if (defined $$param{general}{'pair frequency'}){ 
	    $feature{$_}{'pair frequency'}=$$param{general}{'pair frequency'};
	}
    }
    if (not defined $feature{$_}{'min score'}){ 
	if (defined $$param{general}{'min score'}){ 
	    $feature{$_}{'min score'}=$$param{general}{'min score'};
	}
    }
    $output{$_}=Uplug::IO::Any->new($$OutDbm{$_});
    if (not ref($output{$_})){delete $output{$_};next;}
#    print STDERR "new $_\n";
    $output{$_}->addheader($header);
    if (not $output{$_}->open('write',$$OutDbm{$_})){delete $output{$_};}
}
if (not keys %output){die "# LinkClue: no output found!\n";}

#---------------------------------------------------------------------------
# main ....

my $in=Uplug::Data::Align->new;
my $count=0;
my %FeatPair=();
my $PrintProgr=1;

while ($input->read($in)){

    $count++;
    if ($PrintProgr){
	if (not ($count % 500)){$|=1;print STDERR "$count segments\n";$|=0;}
	if (not ($count % 50)){$|=1;print STDERR '.';$|=0;}
    }

#    my $link=$in->{link};
    my $link=$in->linkData();
    my @ids=$link->findNodes('wordLink');

    foreach my $n (0..$#ids){

	my $xtrg=$link->attribute($ids[$n],'xtargets');
	my ($srcID,$trgID)=split(/\;/,$xtrg);

#	$in->subData($in->{source},'source');
	my @srcPhrNodes=&GetPhraseNodes($in->{source},$srcID);
#	$in->subData($in->{target},'target');
	my @trgPhrNodes=&GetPhraseNodes($in->{target},$trgID);

	foreach my $o (keys %output){

	    my $SrcFeat=$in->getSrcPhraseFeature(\@srcPhrNodes,$feature{$o});
	    my $TrgFeat=$in->getTrgPhraseFeature(\@trgPhrNodes,$feature{$o});

	    if (defined $feature{$o}{'relative position'}){
		my $pos=$in->getRelativePosition(\@srcPhrNodes,\@trgPhrNodes);
		if ($TrgFeat=~/\S/){
		    $TrgFeat.=":$pos";
		}
		else{$TrgFeat=$pos;}
		if ($SrcFeat=~/\S/){
		    $SrcFeat.=":x";
		}
		else{$SrcFeat='x';}
	    }

	    $FeatPair{$o}{$SrcFeat}{$TrgFeat}++;
	    $FeatPair{$o}{$SrcFeat}{'__all'}++;
	    $FeatPair{$o}{$TrgFeat}{'__all'}++;
	}
    }
}

$input->close;

foreach my $o (keys %output){
    &SaveFeatDice($FeatPair{$o},$output{$o},$feature{$o});
    $output{$o}->close;
}


#---------------------------------------------------------------------------


sub AddRelPosFeature{
    my ($data,$src,$trg)=@_;
    if (ref($src) and ref($trg)){
	my $srcID=$data->attribute($src,'id');
	my $trgID=$data->attribute($trg,'id');
	if ($srcID=~/(\A|[^0-9])([0-9]+)$/){
	    my $pos=$2;
	    if ($trgID=~/(\A|[^0-9])([0-9]+)$/){
		return $2-$pos;
	    }
	}
    }
    return 0;
}

sub GetPhraseNodes{
    my $data=shift;
    my $idStr=shift;
    my @ids=split(/[\:\+]/,$idStr);
    my @nodes=();
    foreach (@ids){
	my ($node)=$data->findNodes('.*',{id => $_});
	if (ref($node)){
	    push (@nodes,$node);
	}
    }
    if ((not @nodes) and (@ids)){        # in case the nodes haven't been found
	my @n=$data->contentElements();  # (e.g. no IDs in the XML-file)
	foreach (@ids){
	    push (@nodes,$n[$_]);
	}
    }
    return @nodes;
}


sub SaveCondProb{
    my ($pairs,$out,$param)=@_;
    my $minFreq=$param->{'pair frequency'};
    my $minScore=$param->{'min score'};

    &WriteFeatureHeader($out,$param);

    foreach my $s (keys %{$pairs}){
	foreach my $t (keys %{$pairs->{$s}}){
	    if ($t eq '__all'){next;}
	    if ($FeatPair{$s}{'__all'}<$minFreq){next;}
	    $FeatPair{$s}{$t}/=$FeatPair{$s}{'__all'};
	    if ($FeatPair{$s}{$t}<$minScore){next;}
	    my $data=Uplug::Data->new;
	    $data->setAttribute('source',$s);
	    $data->setAttribute('target',$t);
	    $data->setAttribute('score',$pairs->{$s}->{$t});
	    $out->write($data);
	}
    }
}

sub SaveFeatDice{
    my ($pairs,$out,$param)=@_;
    my $minFreq=$param->{'pair frequency'};
    my $minScore=$param->{'min score'};

    &WriteFeatureHeader($out,$param);

    foreach my $s (keys %{$pairs}){
	foreach my $t (keys %{$pairs->{$s}}){
	    if ($t eq '__all'){next;}
	    if ($$pairs{$s}{'__all'}<$minFreq){next;}
	    if ($$pairs{$t}{'__all'}<$minFreq){next;}
	    my $score=2*$$pairs{$s}{$t}/
		($$pairs{$s}{'__all'}+$$pairs{$t}{'__all'});
	    if ($score<$minScore){next;}
	    my $data=Uplug::Data->new;
	    $data->setAttribute('source',$s);
	    $data->setAttribute('target',$t);
	    $data->setAttribute('score',$score);
	    $out->write($data);
	}
    }
}

sub WriteFeatureHeader{
    my ($out,$param)=@_;
    $out->addheader($param);
    $out->writeheader;
}


sub GetSrcFeature{
    return &GetFeature(@_,'source');
}
sub GetTrgFeature{
    return &GetFeature(@_,'target');
}



sub GetFeature{
    my ($data,$id,$param,$subtree)=@_;

    if (defined $$param{parameter}{"$subtree feature"}{attribute}){
	my $attr=$$param{parameter}{"$subtree feature"}{attribute};
	return &GetAttrFeature($data,$id,$attr);
    }
    if (defined $$param{parameter}{"$subtree feature"}{'relative position'}){
	return &GetRelPos($data,$id,$subtree);
    }
}


sub GetAttrFeature{
    my ($data,$id,$attr)=@_;

    my @features=();
    my @ids=split(/\:/,$id);

    foreach my $i (@ids){
	push (@features,$data->attribute($attr,'.*',{id => $i}));
    }
    return join ' ',@features;
}


my $SrcPos;

sub GetRelPos{
    my ($data,$id,$subtree)=@_;

    my @features=();
    my @ids=split(/\:/,$id);

    foreach my $i (@ids){
	my $pos=-1;
	if ($i=~/(\A|[^0-9])([0-9]+)$/){
	    $pos=$2;
	}
	if ($subtree eq 'source'){
	    $SrcPos=$pos;
	}
	else{
	    $pos-=$SrcPos;
	    push (@features,$pos);
	}
    }
    if ($subtree eq 'source'){return 'x';}
    return join ' ',@features;
}


sub GetDefaultIni{

    my $DefaultIni = {
  'input' => {
    'text' => {
      'format' => 'align',
      'write_mode' => 'write',
      'file' => 'data/align.xml',
    }
  },
  'output' => {
    'feature' => {
      'format' => 'DBM',
      'write_mode' => 'overwrite',
      'file' => 'data/pos.dbm',
      'key' => ['source','target']
    },
  },
  'parameter' => {
    'token' => {
      'relative position' => 1,
       'features (source)' => {
           'pos' => undef,
       },
       'features (target)' => {
           'pos' => undef,
       },
    },
    'general' => {
      'pair frequency' => 4,
#       'min score' => 0.2,
    },
  },
  'arguments' => {
    'shortcuts' => {
       'in' => 'input:text:format',
       'infile' => 'input:text:file',
       'informat' => 'input:text:format',
       'out' => 'output:feature:file',
    }
  },
};
    return %{$DefaultIni};
}