####################################################################
# Copyright (C) 2004 Jörg Tiedemann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Author$
# $Id$
#
###########################################################################
# Uplug::Data::Align
#
#
#
###########################################################################
package Uplug::Data::Align;
use strict;
use vars qw( @ISA );
use Uplug::Data;
use Uplug::Data::Lang;
@ISA = qw( Uplug::Data );
sub init{
my $self=shift;
my $srclang=shift;
my $trglang=shift;
if (not defined $self->{source}){
$self->{source}=Uplug::Data::Lang->new($srclang);
$self->setOption('SRCLANG',$srclang);
}
elsif ((defined $srclang) and ($self->{SRCLANG} ne $srclang)){
$self->{source}=Uplug::Data::Lang->init($srclang);
$self->setOption('SRCLANG',$srclang);
}
else{$self->{source}->init();}
if (not defined $self->{target}){
$self->{target}=Uplug::Data::Lang->new($trglang);
$self->setOption('TRGLANG',$trglang);
}
elsif ((defined $trglang) and ($self->{TRGLANG} ne $trglang)){
$self->{target}=Uplug::Data::Lang->new($trglang);
$self->setOption('TRGLANG',$trglang);
}
else{$self->{target}->init();}
# $self->{SRCSUBNEW}=1; # flag for creating new source sub-trees
# $self->{TRGSUBNEW}=1; # flag for creating new target sub-trees
return $self->SUPER::init(@_);
}
sub clone{return Uplug::Data::Align->new();}
sub makeLangSubData{
my $self=shift;
$self->subData($self->{'source'},'source');
$self->subData($self->{'target'},'target');
}
sub sourceData{return $_[0]->{source};}
sub targetData{return $_[0]->{target};}
sub linkData{return $_[0]->{link};}
sub getTokens{
my $self=shift;
my $lang=shift;
my $param=shift;
if (not defined $lang){$lang='source'};
if (not ref($self->{$lang})){return undef;}
# $self->subData($self->{$lang},$lang);
# $self->{$lang}=$self->subData($lang);
$self->makeParameter($param,$lang);
return $self->{$lang}->getTokens($param,@_);
}
sub getSrcTokens{my $self=shift;return $self->getTokens('source',@_);}
sub getTrgTokens{my $self=shift;return $self->getTokens('target',@_);}
sub getNgrams{
my $self=shift;
my $lang=shift;
my $param=shift;
if (not defined $lang){$lang='source'};
if (not ref($self->{$lang})){return undef;}
# $self->subData($self->{$lang},$lang);
# $self->{$lang}=$self->subData($lang);
$self->makeParameter($param,$lang);
return $self->{$lang}->getNgrams($param,@_);
}
sub getSrcNgrams{my $self=shift;return $self->getNgrams('source',@_);}
sub getTrgNgrams{my $self=shift;return $self->getNgrams('target',@_);}
sub getChunks{
my $self=shift;
my $lang=shift;
my $param=shift;
if (not defined $lang){$lang='source'};
if (not ref($self->{$lang})){return undef;}
# $self->subData($self->{$lang},$lang);
# $self->{$lang}=$self->subData($lang);
$self->makeParameter($param,$lang);
return $self->{$lang}->getChunks($param,@_);
}
sub getSrcChunks{my $self=shift;return $self->getChunks('source',@_);}
sub getTrgChunks{my $self=shift;return $self->getChunks('target',@_);}
sub getPhrases{
my $self=shift;
my $lang=shift;
my $param=shift;
if (not defined $lang){$lang='source'};
if (not ref($self->{$lang})){return undef;}
# $self->subData($self->{$lang},$lang);
# $self->{$lang}=$self->subData($lang);
$self->makeParameter($param,$lang);
return $self->{$lang}->getPhrases($param,@_);
}
sub getSrcPhrases{my $self=shift;return $self->getPhrases('source',@_);}
sub getTrgPhrases{my $self=shift;return $self->getPhrases('target',@_);}
sub getPhrasePos{
my $self=shift;
my ($phraseNodes,$tokenNodes)=@_;
my @idx=();
foreach my $p (0..$#{$phraseNodes}){
my $lastIdx=0;
foreach my $t ($lastIdx..$#{$tokenNodes}){
if ($$phraseNodes[$p]==$$tokenNodes[$t]){
push(@idx,$t);
$lastIdx=$t+1;
}
}
}
return join ":",@idx;
}
sub getRelativePosition{
my $self=shift;
my ($srcPhr,$trgPhr)=@_;
my $srcPos=$self->{source}->getPhrasePosition($srcPhr);
my $trgPos=$self->{target}->getPhrasePosition($trgPhr);
if (not defined $srcPos){return 0;}
if (not defined $trgPos){return 0;}
return $trgPos-$srcPos;
}
sub getFeaturePairs{
my $self=shift;
return getAlignPhrases(@_);
}
sub getAlignPhrases{
my $self=shift;
my ($param,$src,$trg,$token,$attr)=@_;
if (ref($param) ne 'HASH'){$param={};}
#----------------------------------------------------------------------
my @srcTokNodes=(); # 1) get all tokens
my @trgTokNodes=();
my @srcTok=$self->getSrcTokens($param,\@srcTokNodes);
my @trgTok=$self->getTrgTokens($param,\@trgTokNodes);
#----------------------------------------------------------------------
my $srcNodes=[]; # 2) get all possible phrases
my $trgNodes=[];
my @srcPhr=$self->getSrcPhrases($param,$srcNodes,
\@srcTokNodes,\@srcTok);
my @trgPhr=$self->getTrgPhrases($param,$trgNodes,
\@trgTokNodes,\@trgTok);
#----------------------------------------------------------------------
my @srcIdx=(); # 3) get token positions for each phrase
my @trgIdx=();
foreach (0..$#srcPhr){
push (@srcIdx,$self->getPhrasePos($$srcNodes[$_],\@srcTokNodes));
}
foreach (0..$#trgPhr){
push (@trgIdx,$self->getPhrasePos($$trgNodes[$_],\@trgTokNodes));
}
#----------------------------------------------------------------------
$self->makeParameter($param,'source'); # get source feature
foreach (0..$#{$srcNodes}){
$$src{$srcIdx[$_]}=
$self->{source}->getPhraseFeature(\@{$$srcNodes[$_]},
$param);
if (defined $$param{'relative position'}){
my $srcPos=$self->{source}->getPhrasePosition($$srcNodes[$_]);
if ($$src{$srcIdx[$_]}=~/\S/){
$$src{$srcIdx[$_]}.=":pos($srcPos)";
}
else{
$$src{$srcIdx[$_]}="pos($srcPos)";
}
}
}
$self->makeParameter($param,'target'); # and generate feature
foreach (0..$#{$trgNodes}){
$$trg{$trgIdx[$_]}=
$self->{target}->getPhraseFeature(\@{$$trgNodes[$_]},
$param);
if (defined $$param{'relative position'}){
my $trgPos=$self->{target}->getPhrasePosition($$trgNodes[$_]);
if ($$trg{$trgIdx[$_]}=~/\S/){
$$trg{$trgIdx[$_]}.=":pos($trgPos)";
}
else{
$$trg{$trgIdx[$_]}="pos($trgPos)";
}
}
}
if (ref($token) eq 'HASH'){
@{$$token{source}}=@srcTok;
@{$$token{target}}=@trgTok;
if (ref($attr) eq 'HASH'){
@{$$attr{source}}=$self->{source}->attribute(\@srcTokNodes);
@{$$attr{target}}=$self->{target}->attribute(\@trgTokNodes);
foreach (0..$#srcTokNodes){
$$attr{source}[$_]{content}=$self->content($srcTokNodes[$_]);
}
foreach (0..$#trgTokNodes){
$$attr{target}[$_]{content}=$self->content($trgTokNodes[$_]);
}
}
}
}
sub getSrcTokenFeatures{
my $self=shift;
return $self->getTokenFeatures('source',@_);
}
sub getTrgTokenFeatures{
my $self=shift;
return $self->getTokenFeatures('target',@_);
}
sub getTokenFeatures{
my $self=shift;
my $lang=shift; # source / target
my ($param,$nodes)=@_;
if (ref($param) ne 'HASH'){$param={};}
if (ref($nodes) ne 'ARRAY'){$nodes=[];}
if (not ref($self->{$lang})){return undef;}
#----------------------------------------------------------------------
$self->makeParameter($param,$lang);
my @tok=$self->{$lang}->getTokens($param,$nodes);
if (keys %{$param}){
foreach (0..$#{$nodes}){
$tok[$_]=$self->{$lang}->getPhraseFeature([$$nodes[$_]],$param);
}
}
return @tok;
}
sub getBitextPhrases{
my $self=shift;
my ($param,$src,$trg,$token,$attr)=@_;
if (ref($param) ne 'HASH'){$param={};}
#----------------------------------------------------------------------
my @srcTokNodes=(); # 1) get all tokens
my @trgTokNodes=();
my @srcTok=$self->getSrcTokens($$param{general},\@srcTokNodes);
my @trgTok=$self->getTrgTokens($$param{general},\@trgTokNodes);
#----------------------------------------------------------------------
my $srcNodes=[]; # 2) get all possible phrases
my $trgNodes=[];
my @srcPhr=$self->getSrcPhrases($$param{general},$srcNodes,
\@srcTokNodes,\@srcTok);
my @trgPhr=$self->getTrgPhrases($$param{general},$trgNodes,
\@trgTokNodes,\@trgTok);
#----------------------------------------------------------------------
my @srcIdx=(); # 3) get token positions for each phrase
my @trgIdx=();
foreach (0..$#srcPhr){
push (@srcIdx,$self->getPhrasePos($$srcNodes[$_],\@srcTokNodes));
}
foreach (0..$#trgPhr){
push (@trgIdx,$self->getPhrasePos($$trgNodes[$_],\@trgTokNodes));
}
#----------------------------------------------------------------------
foreach my $p (keys %{$param}){ # 4) generate phrase features
if ($p eq 'general'){ # a) general = phrase string
foreach (0..$#srcPhr){
$$src{$srcIdx[$_]}{$p}=$srcPhr[$_]; # the source phrase
}
foreach (0..$#trgPhr){
$$trg{$trgIdx[$_]}{$p}=$trgPhr[$_]; # the target phrase
}
next;
}
my $srcParam=$$param{$p}; # b) feature parameter
$self->makeParameter($srcParam,'source'); # get source feature
foreach (0..$#{$srcNodes}){
$$src{$srcIdx[$_]}{$p}=
$self->{source}->getPhraseFeature(\@{$$srcNodes[$_]},
$srcParam);
if ((ref($$param{$p}) eq 'HASH') and
(defined $$param{$p}{'relative position'})){
my $srcPos=$self->{source}->getPhrasePosition($$srcNodes[$_]);
if ($$src{$srcIdx[$_]}{$p}=~/\S/){
$$src{$srcIdx[$_]}{$p}.=":pos($srcPos)";
}
else{
$$src{$srcIdx[$_]}{$p}="pos($srcPos)";
}
}
}
my $trgParam=$$param{$p}; # get target features
$self->makeParameter($trgParam,'target'); # and generate feature
foreach (0..$#{$trgNodes}){
$$trg{$trgIdx[$_]}{$p}=
$self->{target}->getPhraseFeature(\@{$$trgNodes[$_]},
$trgParam);
if ((ref($$param{$p}) eq 'HASH') and
(defined $$param{$p}{'relative position'})){
my $trgPos=$self->{target}->getPhrasePosition($$trgNodes[$_]);
if ($$trg{$trgIdx[$_]}{$p}=~/\S/){
$$trg{$trgIdx[$_]}{$p}.=":pos($trgPos)";
}
else{
$$trg{$trgIdx[$_]}{$p}="pos($trgPos)";
}
}
}
}
if (ref($token) eq 'HASH'){
@{$$token{source}}=@srcTok;
@{$$token{target}}=@trgTok;
if (ref($attr) eq 'HASH'){
@{$$attr{source}}=$self->{source}->attribute(\@srcTokNodes);
@{$$attr{target}}=$self->{target}->attribute(\@trgTokNodes);
foreach (0..$#srcTokNodes){
$$attr{source}[$_]{content}=$self->content($srcTokNodes[$_]);
}
foreach (0..$#trgTokNodes){
$$attr{target}[$_]{content}=$self->content($trgTokNodes[$_]);
}
}
}
}
sub getPhraseFeature{
my $self=shift;
my $lang=shift;
my $nodes=shift;
my $param=shift;
if (not defined $lang){$lang='source'};
if (not ref($self->{$lang})){return undef;}
# $self->subData($self->{$lang},$lang);
# $self->{$lang}=$self->subData($lang);
$self->makeParameter($param,$lang);
return $self->{$lang}->getPhraseFeature($nodes,$param,@_);
}
sub getSrcPhraseFeature{my $s=shift;return $s->getPhraseFeature('source',@_);}
sub getTrgPhraseFeature{my $s=shift;return $s->getPhraseFeature('target',@_);}
sub checkPairParameter{
my $self=shift;
my ($src,$trg,$param)=@_;
if ($$param{'minimal length (source)'}){
if (length($src)<$$param{'minimal length (source)'}){
# print STDERR "minimale length (source)\n";
return 0;
}
}
if ($$param{'minimal length (target)'}){
if (length($trg)<$$param{'minimal length (target)'}){
# print STDERR "minimale length (target)\n";
return 0;
}
}
if ($$param{'minimal length diff'}){
if ($self->lengthQuotient($src,$trg)<$$param{'minimal length diff'}){
# print STDERR "minimale length diff\n";
return 0;
}
}
if ($$param{'matching word class'}){
if (not $self->isSameType($src,$trg,$$param{'matching word class'})){
# print STDERR "matching word class\n";
return 0;
}
}
if ($$param{'stop words'}){
if (not $self->isSameType($src,$trg,$$param{'stop words'})){
# print STDERR "stop words\n";
return 0;
}
}
return 1;
}
sub isSameType{
my $self=shift;
my ($src,$trg,$check)=@_;
if (($check eq 'open/closed') or ($check eq 'same')){
if ($self->{source}->isStopWord($src)){
return $self->{target}->isStopWord($trg);
}
return (not $self->{target}->isStopWord($trg));
}
elsif ($check eq 'exclude'){
if (not $self->{source}->isStopWord($src)){
return 1;
}
return (not $self->{target}->isStopWord($trg));
}
elsif(($check eq 'same_class') or ($check eq 'wordclass')){
return $self->isSameClass($src,$trg);
}
elsif(($check eq 'same_sub_class') or ($check eq 'subclass')){
return $self->isSameSubClass($src,$trg);
}
return 1;
}
sub isSameClass{
my $self=shift;
my ($src,$trg)=@_;
my $cat='stop word class hash';
my $SrcData=$self->{source}->getLanguageData($cat);
my $TrgData=$self->{target}->getLanguageData($cat);
if (ref($SrcData) ne 'HASH'){return 1;}
if (ref($TrgData) ne 'HASH'){return 1;}
foreach my $c (%{$SrcData}){
if (defined $$SrcData{$c}{$src}){
if (defined $$TrgData{$c}){
if (defined $$TrgData{$c}{$trg}){
return 1;
}
}
return 0;
}
}
foreach my $c (%{$TrgData}){
if (defined $$TrgData{$c}{$trg}){
return 0;
}
}
return 1;
}
sub isSameSubClass{
my $self=shift;
my ($src,$trg)=@_;
my $cat='stop word subclass hash';
my $SrcData=$self->{source}->getLanguageData($cat);
my $TrgData=$self->{target}->getLanguageData($cat);
if (ref($SrcData) ne 'HASH'){return 1;}
if (ref($TrgData) ne 'HASH'){return 1;}
foreach my $x (%{$SrcData}){
foreach my $y (%{$$SrcData{$x}}){
if (defined $$SrcData{$x}{$y}{$src}){
if (defined $$TrgData{$x}){
if (defined $$TrgData{$x}{$y}){
if (defined $$TrgData{$x}{$y}{$trg}){
return 1;
}
}
}
return 0;
}
}
}
foreach my $x (keys %{$TrgData}){
foreach my $y (keys %{$$TrgData{$x}}){
if (defined $$TrgData{$x}{$y}{$trg}){
return 0;
}
}
}
return 1;
}
sub lengthQuotient{
my $self=shift;
my ($src,$trg)=@_;
if (length($src)==0 or length($trg)==0) {return 0;}
if (length($src)>length($trg)) {return length($trg)/length($src);}
else {return length($src)/length($trg);}
}
sub makeParameter{
my $self=shift;
my ($param,$lang)=@_;
if (ref($param) ne 'HASH'){return;}
foreach (keys %{$param}){
if (/^(.*) \($lang\)/){
$param->{$1}=$param->{$_};
}
}
}
sub rmLinkedToken{
my $data=shift;
# my $srcData=Uplug::Data::Lang->new;
# my $trgData=Uplug::Data::Lang->new;
# $data->subData($srcData,'source');
# $data->subData($trgData,'target');
## $data->{'source'}=$data->subData('source');
## $data->{'target'}=$data->subData('target');
my $srcData=$data->{source};
my $trgData=$data->{target};
my $link=$data->{link};
my @nodes=$link->findNodes('wordLink');
my @xtrg=$link->attribute(\@nodes,'xtargets');
foreach my $l (@xtrg){
if ($l=~/^(.*\S)\s?\;\s?(\S.*)$/){
my ($s,$t)=($1,$2);
$data->rmToken($s,$srcData);
$data->rmToken($t,$trgData);
}
}
}
sub rmToken{
my $self=shift;
my ($span,$data)=@_;
my @token=split(/[\+\s]/,$span);
foreach (@token){
my ($node)=$data->findNodes('.*',{id => $_});
if (defined $node){
$node->getParentNode->removeChild($node);
$node->dispose();
}
}
}
sub rmWordLinks{
my $data=shift;
# $data->{link}->delAttribute('wordLink');
if (ref($data->{link})){
$data->{link}->delNodes('wordLink');
}
}
sub findLink{
my $self=shift;
my $link=shift;
my %attr=();
$attr{src}=$link->{source};
$attr{trg}=$link->{target};
my @nodes=$self->{link}->findNodes('wordLink',\%attr);
if (@nodes){
return @nodes;
}
return undef;
}
sub addWordLink{
my $data=shift;
my $link=shift;
my $OutData=$data->{link};
if (defined $data->findLink($link)){return;}
my %attr=();
if (defined $link->{score}){
$attr{certainty}=$link->{score};
}
$attr{lexPair}=$link->{link};
$attr{xtargets}="$link->{source};$link->{target}";
$attr{xtargets}=~tr/:/+/;
if ($link->{src} and $link->{trg}){
$link->{src}=~tr/\&/\+/;
$link->{trg}=~tr/\&/\+/;
$attr{span}="$link->{src};$link->{trg}"
}
my $wordLink=$OutData->createNode('wordLink',\%attr);
$OutData->addNode($wordLink);
# if (defined $link->{step}){$attr{step}=$link->{step};}
# $attr{'id'}=$id;
# $attr{'content'}="\n$src:$trg\n";
}
sub toHTML{
my $self=shift;
my $html=$self->{source}->toHTML();
$html.=$self->{target}->toHTML();
}