The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#-*-perl-*-
#####################################################################
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

#####################################################################
#
# XCESalign
#
#####################################################################
# $Author$
# $Id$


package Uplug::IO::XCESalign;

use strict;
use vars qw(@ISA);
use vars qw(%StreamOptions);
use Uplug::IO::XML;
use Uplug::IO::Any;
use Uplug::Data;
# use Uplug::Data::DOM;

@ISA = qw( Uplug::IO::XML );

# stream options and their default values!!

%StreamOptions = ('DocRootTag' => 'cesAlign',
		  'root' => 'link',
		  'DataStructure' => 'complex',
		  'SkipDataHeader' => 1,
		  'SkipDataTail' => 1,
		  'DTDname' => 'cesAlign',
		  'DTDsystemID' => 'dtd/xcesAlign.dtd',
		  'DTDpublicID' => '-//CES//DTD XML cesAlign//EN'
		  );
$StreamOptions{DocRoot}{version}='1.0';

sub new{
    my $class=shift;
    my $self=$class->SUPER::new($class);
    foreach (keys %StreamOptions){
	$self->setOption($_,$StreamOptions{$_});
    }
#    &Uplug::IO::AddHash2Hash($self->{StreamOptions},\%StreamOptions);
    return $self;
}


sub open{
    my $self            = shift;
    if ($self->SUPER::open(@_)){

	if ($self->{AccessMode} eq 'read'){              # if access-mode=read:
	    if (not $self->option('DocBodyTag')){        # set doc-body-tag
		$self->setOption('DocBodyTag','(linkGrp|linkAlign)');
	    }
	}
	if (defined $self->{StreamOptions}->{fromDoc}){
	    $self->OpenAlignDocs($self->{StreamOptions});
	}
	elsif (defined $self->{StreamHeader}->{fromDoc}){
	    $self->OpenAlignDocs($self->{StreamHeader});
	    $self->{StreamOptions}->{fromDoc}=$self->{StreamHeader}->{fromDoc};
	    $self->{StreamOptions}->{toDoc}=$self->{StreamHeader}->{toDoc};
	}
	elsif (defined $self->{StreamOptions}->{DocRoot}->{fromDoc}){
	    $self->OpenAlignDocs($self->{StreamOptions}->{DocRoot});
	    $self->{StreamOptions}->{fromDoc}=
		$self->{StreamOptions}->{DocRoot}->{fromDoc};
	    $self->{StreamOptions}->{toDoc}=
		$self->{StreamOptions}->{DocRoot}->{toDoc};
	}
	elsif (defined $self->{StreamHeader}->{DocRoot}->{fromDoc}){
	    $self->OpenAlignDocs($self->{StreamHeader}->{DocRoot});
	    $self->{StreamOptions}->{fromDoc}=
		$self->{StreamHeader}->{DocRoot}->{fromDoc};
	    $self->{StreamOptions}->{toDoc}=
		$self->{StreamHeader}->{DocRoot}->{toDoc};
	}

	if ($self->{AccessMode} ne 'read'){
	    if (($self->{fromDoc} ne 'exists') and (not ref($self->{source}))){
		$self->{StreamOptions}->{fromDoc}=
		    $self->{StreamOptions}->{file}.'.src';
	    }
	    if (($self->{toDoc} ne 'exists') and (not ref($self->{target}))){
		$self->{StreamOptions}->{toDoc}=
		    $self->{StreamOptions}->{file}.'.trg';
	    }
	    $self->OpenAlignDocs($self->{StreamOptions});

	    # write linkGrp-tag only of we skip data headers!
	    # otherwise: linkGrp is expected to be in the data-headers
	    #            of data written to cesAlign files!!

	    if ($self->{StreamOptions}->{SkipDataHeader}){
		$self->OpenTag('linkGrp',
			       {'targType' => 's',
				'fromDoc' => $self->{StreamOptions}->{fromDoc},
				'toDoc' => $self->{StreamOptions}->{toDoc}});
	    }
	}

	if (ref($self->{XmlParser})){
	    $self->{XmlHandle}->{REMOVESPACES}=$self->option('REMOVESPACES');
	    $self->{XmlHandle}->{SubTreeRoot}=$self->option('root');
	    $self->{XmlHandle}->{DocRootTag}=$self->option('DocRootTag');
	    $self->{XmlHandle}->{DocBodyTag}=$self->option('DocBodyTag');
	    $self->CompileTagREs;
	}

	return 1;
    }
    return 0;

}

sub close{
    my $self=shift;
    if (ref($self->{source})){
	$self->{source}->close();
    }
    if (ref($self->{target})){
	$self->{target}->close();
    }
    return $self->SUPER::close();
}


sub read{
    my $self=shift;
    my $data=shift;

    $data->init;
    if (not ref($data->{link})){$data->{link}=Uplug::Data->new;}
    else{$data->{link}->init;}

    my $ret=$self->SUPER::read($data->{link},@_);
    if ($self->NewDocBody){
	my $BodyAttr=$self->DocBodyAttr;
	$self->addheader($BodyAttr);
	if (not  $self->OpenAlignDocs($BodyAttr)){
	    return 0;
	}
	delete $data->{sourceSent};     # delete previous data-objects
	delete $data->{targetSent};     # (to get new XML::Parser instances)
    }

    my $attr=$data->{link}->attribute();
#     $data->addChild('align',undef,$attr);
    $data->addNode('align',$attr);
    my ($src,$trg);
    my $xtargets=$data->{link}->attribute('xtargets');
    if (defined $xtargets){
	($src,$trg)=split(/\s*\;\s*/,$xtargets);
    }
    else{return 0;}

    my @srcID=split(/\s/,$src);
    my @trgID=split(/\s/,$trg);

#----------- sentences ------------------------------

    &ReadSegments($self->{source},$data,\@srcID,'source');
    &ReadSegments($self->{target},$data,\@trgID,'target');

    $self->Uplug::IO::read($data);

#    $data->{source}=$data->subData('source');
#    $data->{target}=$data->subData('target');

#    $data->subTree($data->{source},'source');
#    $data->subTree($data->{target},'target');

    return $ret;

}




sub write{
    my $self=shift;
    my ($data)=@_;

    $self->Uplug::IO::write($data);

    my $ret;
    my $source=$data->subData('source');
    my $target=$data->subData('target');

#     my $source=Uplug::Data::DOM->new;
#    my $source=Uplug::Data->new;
#    $data->subTree($source,'source');
#    my $target=Uplug::Data->new;
#     my $target=Uplug::Data::DOM->new;
#    $data->subTree($target,'target');


    if (ref($data->{link})){
	$ret=$self->SUPER::write($data->{link});
    }
    elsif (defined $data->attribute('xtargets')){
	$ret=$self->SUPER::write($data);
    }
    else{
	my $tag=$self->option('root');
	my $link=Uplug::Data->new($tag);
	my @src=$source->findNodes('s');
	my @srcID=$source->attribute(\@src,'id');
	foreach (0..$#src){
	    if (not defined $srcID[$_]){
		$self->{srcID}++;
		$srcID[$_]=$self->{srcID};
		$source->setAttribute($src[$_],'id',$srcID[$_]);
	    }
	    $self->{srcID}=$srcID[$_];
	}
	my @trg=$target->findNodes('s');
	my @trgID=$target->attribute(\@trg,'id');
	foreach (0..$#trg){
	    if (not defined $trgID[$_]){
		$self->{trgID}++;
		$trgID[$_]=$self->{trgID};
		$target->setAttribute($trg[$_],'id',$trgID[$_]);
	    }
	    $self->{trgID}=$trgID[$_];
	}
	my $s=join(' ',@srcID);
	my $t=join(' ',@trgID);
	$link->setAttribute('xtargets',"$s\;$t",$tag);

	my $id=$data->attribute('id');
	if (not defined $id){
	    $self->{alignID}++;
	    $id=$self->{alignID};
	}
	$link->setAttribute('id',$id);
	$ret=$self->SUPER::write($link);
    }
    if (($self->{fromDoc} ne 'exists') and (ref($self->{source}))){
#	$source->moveAttribute('s','seg');
	$self->{source}->write($source);
    }
    if (($self->{toDoc} ne 'exists') and (ref($self->{target}))){
#	$target->moveAttribute('s','seg');
	$self->{target}->write($target);
    }
    return $ret;
}



#-----------------------------------------------------------------
# select: select data
#
# * read sequentially through the link-file and
#   search bitext segments according to the matching pattern
#
# we cannot use select for searching in the link file because
# we need fromDoc and toDoc from the alignment tags .....!!!!


sub select{
    my $self=shift;
    my ($data,$pattern,$attr,$operator)=@_;

    $data->init;
    if (not ref($data->{link})){$data->{link}=Uplug::Data->new;}
    else{$data->{link}->init;}

    my %linkPattern=();
    my %srcPattern=();
    my %trgPattern=();
    if (ref($pattern) eq 'HASH'){
	%linkPattern=%{$pattern};
	if (ref($pattern->{source}) eq 'HASH'){
	    %srcPattern=%{$pattern->{source}};
	    delete $linkPattern{source};
	}
	elsif (defined $pattern->{source}){
	    $srcPattern{'#text'}=$pattern->{source};
	    delete $linkPattern{source};
	}
	if (ref($pattern->{target}) eq 'HASH'){
	    %trgPattern=%{$pattern->{target}};
	    delete $linkPattern{target};
	}
	elsif (defined $pattern->{target}){
	    $trgPattern{'#text'}=$pattern->{target};
	    delete $linkPattern{target};
	}
    }

    if ($self->{ENDOFSTREAM}){
	delete $self->{ENDOFSTREAM};
	$self->reopen();
    }

    while ($self->SUPER::read($data->{link})){

	print '';
	if ($self->NewDocBody){
	    my $BodyAttr=$self->DocBodyAttr;
	    $self->addheader($BodyAttr);
	    if (not  $self->OpenAlignDocs($BodyAttr)){
		return 0;
	    }
	    delete $data->{sourceSent};    # delete previous data-objects
	    delete $data->{targetSent};    # (to get new XML::Parser instances)
	}

	if (not $data->{link}->matchData(\%linkPattern,$operator)){next;}


	my $attr=$data->{link}->attribute();
	$data->addNode('align',$attr);
	my ($src,$trg);
	my $xtargets=$data->{link}->attribute('xtargets');

	if (defined $xtargets){
	    ($src,$trg)=split(/\s*\;\s*/,$xtargets);
	}
	else{return 0;}

	my @srcID=split(/\s/,$src);
	my @trgID=split(/\s/,$trg);

        #----------- sentences ------------------------------

#	print STDERR "read src sentences ",join(":",@srcID),"\n";
#	print STDERR "read trg sentences ",join(":",@trgID),"\n";

	&SearchSegments($self->{source},$data,\@srcID,'source');
	&SearchSegments($self->{target},$data,\@trgID,'target');

#	&ReadSegments($self->{source},$data,\@srcID,'source');
#	&ReadSegments($self->{target},$data,\@trgID,'target');

	$self->Uplug::IO::read($data);
	return 1;
    }

    $self->{ENDOFSTREAM}=1;
    return 0;
}


# end of select
#-----------------------------------------------------------------




sub FindAlignDocFile{
    my $self=shift;
    my $doc=shift;
    if ((-s $doc) or (-s "$doc.gz")){return $doc;}   # found it --> ok!

    my $file=$self->files();                            # no?
    my $dir='./';                                       # check relative to the
    if ($file=~/^(.*[\\\/])[^\\\/]+$/){$dir=$1;}        # align-document
    if ((-s $dir.$doc) or (-s "$dir$doc.gz")){          # (align-dir/filename)
#	print STDERR "$doc --> $dir$doc\n";
	if ($doc=~/^([\\\/\.]*[^\\\/\.]+)[\\\/]/){      # make symbolic links!
	    eval { symlink ($dir.$1,$1);1 };            # ... relative dir
	}                                               # ... or
	else{eval { symlink ($dir.$doc,$doc);1 };}      # ... file link
	return $dir.$doc;                               # return path+file
    }
    my $docfile=$doc;                                   # no?
    if ($doc=~/^.*[\\\/]([^\\\/]+)$/){$docfile=$1;}     # remove path in the
    if ((-s $dir.$docfile) or (-s "$dir$docfile.gz")){  # filename and check
#	print STDERR "$doc --> $dir$docfile\n";
	if ($doc=~/^([\\\/\.]*[^\\\/\.]+)[\\\/]/){
	    eval { symlink ($dir.$1,$1);1 };
	}
	else{eval { symlink ($dir.$docfile,$docfile);1 };}
	return $dir.$docfile;                            # found it --> return!
    }                                                      # no? --> remove
    if ($doc=~/^[^\\\/]+[\\\/]([^\\\/].*)$/){$docfile=$1;} # initial directory
    if ((-s $dir.$docfile) or (-s "$dir$docfile.gz")){     # and check again!
#	print STDERR "$doc --> $dir$docfile\n";
	if ($doc=~/^([\\\/\.]*[^\\\/\.]+)[\\\/]/){
	    eval { symlink ($dir,$1);1 };
#	    print STDERR "symlink ($dir,$1)\n";
	}
	else{eval { symlink ($dir.$docfile,$docfile);1 };}
#	print STDERR "symlink ($dir.$docfile,$docfile)\n";
	return $dir.$docfile;
    }
    return $doc;
}



sub OpenAlignDocs{
    my $self=shift;
    my $options=shift;


    if (ref($options) ne 'HASH'){return 0;}
    if (($self->{AccessMode} ne 'read') and 
	((-s $options->{fromDoc}) or (-s "$options->{fromDoc}.gz") or
	 ($self->{StreamOptions}->{SkipSrcFile}))){
	$self->{fromDoc}='exists';
    }
    else{
	$options->{fromDoc}=$self->FindAlignDocFile($options->{fromDoc});
	my %stream=('file' => $options->{fromDoc},
		    'format' => 'XML',
		    'root' => 's');

	## make subtree index files (DBM hash) for the source file
	##
	# if ($self->option('MAKESUBTREEINDEX')){  # commented out -> always!
	    $stream{MAKESUBTREEINDEX} = 1;
	# }


	if ($self->{AccessMode} ne 'read'){$stream{DocRootTag}='document';}
	$self->{source}=Uplug::IO::Any->new(\%stream);
	if (not $self->{source}->open($self->{AccessMode},\%stream)){
	    return 0;
	}
    }
    if (($self->{AccessMode} ne 'read') and
	((-s $options->{toDoc}) or (-s "$options->{toDoc}.gz") or
	 ($self->{StreamOptions}->{SkipTrgFile}))){
	$self->{toDoc}='exists';
    }
    else{
	$options->{toDoc}=$self->FindAlignDocFile($options->{toDoc});
	my %stream=('file' => $options->{toDoc},
		    'format' => 'XML',
		    'root' => 's');

	## make subtree index files (DBM hash) for the source file
	##
	# if ($self->option('MAKESUBTREEINDEX')){  # commented out -> always!
	    $stream{MAKESUBTREEINDEX} = 1;
	# }

	if ($self->{AccessMode} ne 'read'){$stream{DocRootTag}='document';}
	$self->{target}=Uplug::IO::Any->new(\%stream);
	if (not $self->{target}->open($self->{AccessMode},\%stream)){
	    return 0;
	}
    }
    return 1;
}



#-------------------------------------------------------------------------
# ReadSegments: read segments from the aligned XML-files (sentences)
#
#  * read sequentially through the data and add sentences with the 
#    requested IDs
#  * if the ID's don't match --> use the select function for IO::XML
#    (this is probably also just a sequential read -> might be a problem!)

sub ReadSegments{

    my ($stream,$data,$IDs,$lang)=@_;

    if (not ref($stream)){return;}
    if (not @{$IDs}){return;}

    #--------------------------------------------------------------
    if (not ref($data->{$lang})){
	$data->{$lang}=Uplug::Data::Lang->new();  # a new language object
    }
    if (ref($data->{$lang.'Sent'}) ne 'ARRAY'){   # the array of data objects
	$data->{$lang.'Sent'}=[];                 # (one for each sentence)
    }
    #--------------------------------------------------------------

    my $parent=$data->addNode($lang);     # set root node = $lang
    $data->{$lang}->setRoot($parent);     # set root node of sub-lang-data

    my $count = 0;                                  # make a new data object
    if (not ref($data->{$lang.'Sent'}->[$count])){  # for reading the sentence
	$data->{$lang.'Sent'}->[$count]=
	    Uplug::Data->new();
    }
    my $sent = $data->{$lang.'Sent'}->[$count];     # $sents points to it!

    while ($stream->read($sent)){          # read sequentially through the data
	my $id=$sent->attribute('id');     # get the sentence ID
	if (not grep ($_ eq $id,@{$IDs})){ # if s-ID is not in the requested:
	    foreach (0..$#{$IDs}){         # use the select-function

		if (not ref($data->{$lang.'Sent'}->[$_])){
		    $data->{$lang.'Sent'}->[$_]=Uplug::Data->new();
		}
		my $sent = $data->{$lang.'Sent'}->[$_];
		if ($stream->select($sent,{id => $IDs->[$_]})){
		    my $node=$sent->root();
		    $data->addNode($parent,$node);
		}
	    }
	    return;
	}
	my $node=$sent->root();            # otherwise: add the sentence
	$data->addNode($parent,$node);     # and continue to read if necessary
	if ($id eq $IDs->[-1]){last;}

	$count ++;                                      # still more sentences!
	if (not ref($data->{$lang.'Sent'}->[$count])){  # make a new data-
	    $data->{$lang.'Sent'}->[$count]=            # object if necessary
		Uplug::Data->new();
	}
	$sent = $data->{$lang.'Sent'}->[$count];        # let $sent point to it
    }
}



#-------------------------------------------------------------------------
# SearchSegments: search sentences in the aligned XML files
#
# * similar to ReadSegments but uses the select function in IO::XML
#   as standard (does not call data->read at all!)
# * 'select' is just reading sequentially through the data at the moment
#   and could actually be used even for ReadSegments (??!)


sub SearchSegments{

    my ($stream,$data,$IDs,$lang)=@_;

    if (not ref($stream)){return;}
    if (not @{$IDs}){return;}

    #--------------------------------------------------------------
    if (not ref($data->{$lang})){
	$data->{$lang}=Uplug::Data::Lang->new();  # a new language object
    }
    if (ref($data->{$lang.'Sent'}) ne 'ARRAY'){   # the array of data objects
	$data->{$lang.'Sent'}=[];                 # (one for each sentence)
    }
    #--------------------------------------------------------------

    my $parent=$data->addNode($lang);     # set root node = $lang
    $data->{$lang}->setRoot($parent);     # set root node of sub-lang-data

    foreach (0..$#{$IDs}){
	if (not ref($data->{$lang.'Sent'}->[$_])){
	    $data->{$lang.'Sent'}->[$_]=Uplug::Data->new();
	}
	my $sent = $data->{$lang.'Sent'}->[$_];
	if ($stream->select($sent,{id => $IDs->[$_]})){
	    my $node=$sent->root();
	    $data->addNode($parent,$node);
	}
    }
}



#-------------------------------------------------------------------------
# ReadSegments: old version of ReadSegments
#
# * reads sentences only if the IDs match OR the current ID is
#   LOWER than the FIRST sentence ID in the list of requested onces
# * advantage: does not use the select function from IO::XML which is
#   right just a sequential search through the file! this may cause the
#   program to read through the whole file without finding anything, and
#   this is slow
#   (this problem appears if there is an requested ID which is LOWER than
#    the one at the current file position, or if the requested ID does not
#    exist in the file)
# * use this one instead of the one above by removing the 'Old' in the
#   sub-name


sub ReadSegmentsOld{

    my ($stream,$data,$IDs,$lang)=@_;

    if (not ref($stream)){return;}
    if (not @{$IDs}){return;}
    my $i=0;

    #--------------------------------------------------------------
    if (not ref($data->{$lang})){
	$data->{$lang}=Uplug::Data::Lang->new();  # a new language object
    }
    if (not ref($data->{$lang.'Sent'})){
	$data->{$lang.'Sent'}=Uplug::Data->new(); # a new object for reading
    }
    #--------------------------------------------------------------

    my $parent=$data->addNode($lang);     # set root node = $lang
    $data->{$lang}->setRoot($parent);     # set root node of sub-lang-data
    my $sent=$data->{$lang.'Sent'};       # sentences will be read into $sent

    my @start=split(/\./,$IDs->[0]);  # split start ID (can be like 's3.2.5.2')
    map(s/[^0-9]//,@start);           # delete non-digits

    my $fail=0;
    while ($stream->read($sent)){          # read sequentially through the data
	my $id=$sent->attribute('id');     # get the sentence ID
	if (not grep ($_ eq $id,@{$IDs})){ # if s-ID is not in the requested

	    my @nr=split(/\./,$id);        # split into ID-levels
	    map(s/[^0-9]//,@nr);           # delete non-digits

	    foreach my $l (0..$#start){    # compare each ID level
		if ($nr[$l]<$start[$l]){   # if the current ID is lower:
		    last;                  #    OK! continue reading! (we don't
		}                          #    have to check deeper levels!)
		if ($nr[$l]>$start[$l]){   # if larger than the start-ID
		    $fail++;last;          # --> fail and stop
		}
	    }
	    if ($fail>1){last;}            # allow to fail once (why ?!?!)
#		$stream->close;            # we could also re-open: takes too
#		$stream->open('read');     #  much time! ... ignore it for now!
	    next;
	}
	my $node=$sent->root();
	$data->addNode($parent,$node);
	if ($id eq $IDs->[-1]){last;}
	$i++;
    }
}