The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# $Id: readalign,v 1.6 2009/06/10 21:58:01 joerg72 Exp $
#
# usage: readalign [-m max] [-h] xces-file
#
#  -m max: show <max> number of sentence alignments
#  -h    : print html
#

=head1 NAME

uplug-readalign - read sentence alignment in XCES align format

=head1 SYNOPSIS

 # read sentence alignments and print aligned sentences
 uplug-readalign align-file.xml

 # print alignments with alignment certainty >= 0 (link threshold set with -c)
 uplug-readalign -c 0 align-file.xml

 # print alignments with max 2 source sentences and 3 target sentences
 uplug-readalign -S 2 -T 3 align-file.xml

 # print aligned sentences marked as 'de' (source) and 'en' (target)
 # (this only works if sentences are marked with languages:
 #  for example, in the German XML file: <s lang="de">...</s>)
 uplug-readalign -s de -t en align-file.xml

 # wrap aligned sentences in simple HTML
 uplug-readalign -h align-file.xml

 # print max 10 alignments
 uplug-readalign -m 10 align-file.xml

 # specify home directory of aligned XML files
 uplug-readalign -d /path/to/xml/files align-file.xml

 # print XCES align format of all 1:1 sentence alignments
 uplug-readalign -S 1 -T 1 -l align-file.xml

=head1 USAGE

 uplug-readalign [OPTIONS] align-file.xml

=head1 OPTIONS

 -c <thr> ........... set a link threshold <thr>
 -d <dir> ........... set home directory for aligned XML documents
 -h ................. print simple HTML
 -l ................. print links (filter mode)
 -m <max> ........... print max <max> alignments
 -s <LangID> ........ require source sentences to match <LangID>
 -t <LangID> ........ require target sentences to match <LangID>
 -S <max> ........... maximum number of source sentences in alignments
 -T <max> ........... maximum number of target sentences in alignments

=head1 DESCRIPTION

C<uplug-readalign> is a simple script that reads sentence alignments stored in XCES align format and prints the aligned sentences to STDOUT. It requires monotonic alignments (ascending order, no crossing links) of sentences in linked XML files. Linked XML files are specified in the C<toDoc> and C<fromDoc> attributes (see below).

 <cesAlign version="1.0">
  <linkGrp targType="s" toDoc="target1.xml" fromDoc="source1.xml">
    <link certainty="0.88" xtargets="s1.1 s1.2;s1.1" id="SL1" />
    ....
  <linkGrp targType="s" toDoc="target2.xml" fromDoc="source2.xml">
    <link certainty="0.88" xtargets="s1.1;s1.1" id="SL1" />

Several parameters can be set to filter the alignments and to print only certain types of alignments.

C<uplug-readalign> can also be used to filter the XCES alignment files and to print the remaining links in the same XCES align format. Use the C<-l> flag to enable this mode.

=head1 See also

More information on Uplug: Look at L<Uplug::Config>

More downloads:
L<https://bitbucket.org/tiedemann/uplug>

=cut


use strict;
use FindBin qw($Bin);

# Settings controlled by the command-line options (initial values are the
# defaults used when the option is not given).
my $html=0;          # -h: wrap the output in simple HTML
my $max=0;           # -m: stop after this many links (0 = no limit)
my $SrcID=undef;     # -s: language ID required for source sentences
my $TrgID=undef;     # -t: language ID required for target sentences
my $MaxSrc=undef;    # -S: max number of source sentences per link
my $MaxTrg=undef;    # -T: max number of target sentences per link
my $LinkThr=undef;   # -c: links with a lower certainty are skipped
my $FilterMode=0;    # filter-mode: print alignment XML

my $dir='xml';                   # extra directory to check for from/toDoc

# Option letter => variable it sets. Flags are switched on; value options
# take the following command-line argument.
my %FlagOption=(
    'h' => \$html,
    'l' => \$FilterMode,
);
my %ValueOption=(
    'm' => \$max,
    'd' => \$dir,
    's' => \$SrcID,
    't' => \$TrgID,
    'S' => \$MaxSrc,
    'T' => \$MaxTrg,
    'c' => \$LinkThr,
);

# Consume leading arguments that start with '-'. Only the character right
# after the dash selects the option (so '-max' behaves like '-m'), which is
# how options have always been matched by this script.
while (@ARGV and $ARGV[0]=~/^\-/){
    my $o=shift(@ARGV);
    my $letter=substr($o,1,1);
    if (exists $FlagOption{$letter}){
        ${$FlagOption{$letter}}=1;
    }
    elsif (exists $ValueOption{$letter}){
        # a value option at the very end of the command line has no value
        die "Option $o requires an argument\n" unless (@ARGV);
        ${$ValueOption{$letter}}=shift(@ARGV);
    }
    else{
        # report unknown options instead of dropping them silently
        warn "Ignoring unknown option $o\n";
    }
}

# The first remaining argument is the XCES alignment file.
my $ALIGN=shift(@ARGV);
if (not defined $ALIGN){
    die "usage: uplug-readalign [OPTIONS] align-file.xml\n";
}

# Paths of the currently opened source/target documents
# (set while reading the fromDoc/toDoc attributes of the alignment file).
my $srcdoc='';
my $trgdoc='';

# Fall back to a gzipped copy if the plain file is missing.
if ((not -e $ALIGN) and (-e "$ALIGN.gz")){$ALIGN="$ALIGN.gz";}
if (not -e $ALIGN){die "Alignment file $ALIGN does not exist!\n";}

# Open the alignment file on handle F. Gzipped files are read through
# 'gzip -cd'; the list form of the pipe open runs gzip without a shell, so
# the file name is never interpreted as shell syntax. Only a trailing '.gz'
# counts as compressed (the same test that is used for the linked documents).
if ($ALIGN=~/\.gz$/){
    open(F,'-|','gzip','-cd','--',$ALIGN)
        or die "Cannot run gzip on $ALIGN: $!\n";
}
else{
    open(F,'<',$ALIGN)
        or die "Cannot open $ALIGN: $!\n";
}

if ($html){&PrintHtmlHeader();}

# ----------------------------------------------------------------------
# Main loop: read the alignment file (handle F) line by line.
#
#  * fromDoc="..." / toDoc="..." attributes select the source / target
#    document; a document is (re)opened whenever the attribute value changes.
#  * xtargets="SRCIDS;TRGIDS" attributes describe one link; the sentences
#    with these ids are looked up in the two documents and printed.
#
# In filter mode (-l) no sentences are printed; instead every input line is
# echoed, except link lines that were rejected by one of the filters or for
# which no source or no target text was found.
# ----------------------------------------------------------------------

my $srcname='';    # fromDoc value of the current source document
my $trgname='';    # toDoc value of the current target document
my $SRC;           # read handle of the source document (undef if not open)
my $TRG;           # read handle of the target document (undef if not open)

# NOTE: $count is incremented for every link that passes the certainty
# filter, i.e. before the -S/-T and language filters are applied, so -m
# limits the number of links examined, not the number actually printed.
my $count=0;

LINE: while (my $line=<F>){

    # --- source document changed?
    # (compare with the attribute value, not with the resolved path)
    if ($line=~/fromDoc=\"([^\"]+)\"/ and $srcname ne $1){
        $srcname=$1;
        close $SRC if ($SRC);
        $srcdoc=FindDocument($srcname,$dir);
        $SRC=OpenDocument($srcdoc);
    }

    # --- target document changed?
    if ($line=~/toDoc=\"([^\"]+)\"/ and $trgname ne $1){
        $trgname=$1;
        close $TRG if ($TRG);
        $trgdoc=FindDocument($trgname,$dir);
        $TRG=OpenDocument($trgdoc);
        unless ($FilterMode){
            if ($html){print "<p>\n";}
            print "\n# ".$srcdoc;
            if ($html){print '<br>';}
            print "\n# ".$trgdoc."\n\n";
            if ($html){print "<p><hr>\n";}
            else{print "================================\n";}
        }
    }

    # --- sentence link
    if (my ($src,$trg) = $line=~/xtargets=\"([^\"]*)\s*\;\s*([^\"]*)\"/){

        # certainty filter (links without a certainty attribute pass)
        if (defined $LinkThr and $line=~/certainty=\"(.*?)\"/){
            next LINE if ($1<$LinkThr);
        }

        $count++;
        last LINE if ($max and ($count>$max));

        # split(' ') ignores leading and repeated blanks,
        # so no empty ids are produced
        my @srcsent=split(' ',$src);
        my @trgsent=split(' ',$trg);

        next LINE if (defined $MaxSrc and scalar(@srcsent)>$MaxSrc);
        next LINE if (defined $MaxTrg and scalar(@trgsent)>$MaxTrg);

        my ($SrcStr,$srceof)=ReadSentences($SRC,\@srcsent,'(src)',$SrcID,$html);
        my ($TrgStr,$trgeof)=ReadSentences($TRG,\@trgsent,'(trg)',$TrgID,$html);

        # A search that ran into the end of a document leaves nothing to
        # read for the following links: start over from the beginning.
        if ($trgeof){
            close $TRG if ($TRG);
            $TRG=OpenDocument($trgdoc);
        }
        if ($srceof){
            close $SRC if ($SRC);
            $SRC=OpenDocument($srcdoc);
        }

        # skip links for which one side is empty
        next LINE unless ($SrcStr && $TrgStr);

        unless ($FilterMode){
            print $SrcStr;
            print $TrgStr;
            if ($html){print "<hr>\n";}
            else{print "================================\n";}
        }
    }
    print $line if ($FilterMode);
}


close $SRC if ($SRC);
close $TRG if ($TRG);
close F;

if ($html){PrintHtmlTail();}


# FindDocument($doc,$home)
#
# Return the first existing file among $doc, "$doc.gz", "$home/$doc" and
# "$home/$doc.gz"; return $doc unchanged if none of them exists.

sub FindDocument{
    my ($doc,$home)=@_;
    foreach my $candidate ($doc,"$doc.gz","$home/$doc","$home/$doc.gz"){
        return $candidate if (-e $candidate);
    }
    return $doc;
}


# OpenDocument($path)
#
# Open $path for reading and return the file handle. Paths ending in '.gz'
# are read through 'gzip -cd' (list-form pipe open: no shell is involved, so
# document names taken from the alignment file cannot inject shell commands).
# Prints a warning and returns undef if the document cannot be opened.

sub OpenDocument{
    my ($path)=@_;
    my $fh;
    my $ok = ($path=~/\.gz$/)
        ? open($fh,'-|','gzip','-cd','--',$path)
        : open($fh,'<',$path);
    unless ($ok){
        warn "Cannot open document $path: $!\n";
        return;
    }
    return $fh;
}


# ReadSentences($fh,\@ids,$label,$langid,$ashtml)
#
# Read forward in $fh (one '</s>'-terminated chunk at a time) and collect
# the sentences with the given ids, in the given order.
#
#   $fh ...... handle returned by OpenDocument (may be undef)
#   \@ids .... sentence ids to look for
#   $label ... text that replaces everything up to and including the 'id'
#              attribute name of the sentence tag, e.g. '(src)'
#   $langid .. if set and the chunk contains a lang="..." attribute, the
#              sentence is skipped unless lang="$langid" matches
#              ($langid is used as a regular expression, as before)
#   $ashtml .. true: pass the text through Str2Html and append '<br>';
#              false: decode the &gt; &lt; &amp; entities
#
# Returns ($text,$eof): $text holds one line per sentence found, $eof is
# true if the end of the document was reached while searching (the caller
# then has to reopen the document).

sub ReadSentences{
    my ($fh,$ids,$label,$langid,$ashtml)=@_;
    my $text='';
    my $eof=0;
    return ($text,$eof) unless ($fh);

    local $/='</s>';                # read sentence by sentence

  ID: foreach my $id (@{$ids}){
      CHUNK: while (my $sent=<$fh>){
            # ids are literal strings: quote regex meta characters like '.'
            next CHUNK unless ($sent=~/s [^\>]*id="\Q$id\E"/s);

            if ($langid && $sent=~/lang=\".*?\"/){
                next ID unless ($sent=~/lang=\"$langid\"/);
            }
            $sent=~s/^.*<s [^\>]*id/$label/s;   # drop all text before the sentence
            $sent=~s/\n/ /gs;
            $sent=~s/\<[^\>]*>//gs;             # remove markup
            $sent=~s/  +/ /gs;
            if ($ashtml){$sent=Str2Html($sent);}
            else{
                $sent=~s/\&gt\;/\>/gs;
                $sent=~s/\&lt\;/\</gs;
                $sent=~s/\&amp\;/\&/gs;
            }
            $text.=$sent;
            if ($ashtml){$text.="<br>";}
            $text.="\n";
            next ID;
        }
        # the document ended before this id was found:
        # the remaining ids cannot be found either
        $eof=1;
        last ID;
    }
    return ($text,$eof);
}

# Str2Html($string)
#
# Prepare a sentence for HTML output. The string is returned unchanged:
# no characters are escaped here (presumably because the text is taken
# from XML documents, where '&', '<' and '>' are already encoded as
# entities -- confirm before enabling any escaping in this function).

sub Str2Html{
    my ($text)=@_;
    return $text;
}

# PrintHtmlHeader()
#
# Print the opening part of the HTML page to the currently selected output
# handle: doctype, <head> with title and UTF-8 content type, and the
# opening <body> tag.

sub PrintHtmlHeader{
    my @header=(
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"',
        '"http://www.w3.org/TR/REC-html40/loose.dtd"> ',
        '<html>',
        '<head>',
        '<title>Untitled Document</title>',
        '<meta http-equiv="Content-Type" content="text/html;charset=utf-8">',
        '</head>',
        '<body>',
    );
    # one string, each line terminated by a newline
    print join('',map { $_."\n" } @header);
}


# PrintHtmlTail()
#
# Print the closing </body> and </html> tags to the currently selected
# output handle.

sub PrintHtmlTail{
    print "</body>\n"."</html>\n";
}