The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#-*-perl-*-
#
# convert sentence aligned bitexts to factored moses input
# (requires XML::Parser)
#
# opus2moses.pl [OPTIONS] < sentence-align-file.xml
#
# OPTIONS:
#
# -s srcfactors ......... specify source language factors besides surface words
# -t trgfactors ......... the same for the target language (separated by ':')
#                         factors should be attributes of <w> tags!!
#                         (except 'word' which is the word itself)
# -d dir ................ home directory of the OPUS subcorpus
# -n file-pattern ....... skip bitext files that match pattern (e.g. ep-00-1*)
# -i .................... inverse selection (only files matching file pattern)
# -e src-data-file ...... output file for source language data (default = src)
# -f src-data-file ...... output file for target language data (default = trg)
#
# -p sentence-pair-file . stores sentence ID pairs of the extracted pairs
# -l .................... convert to lower case
# -1 .................... 1:1 links only
# -x max ................ max size of sentences (in nr of words)
#

use strict;
use XML::Parser;
use FileHandle;
use File::Basename;


use vars qw($opt_s $opt_t $opt_d $opt_n $opt_i $opt_e $opt_f $opt_h $opt_p $opt_l $opt_1 $opt_x $opt_r $opt_m);
use Getopt::Std;

getopts('s:t:d:n:ie:f:hp:l1x:rm');

if ($opt_h){
    print <<"EOH";

 opus2moses.pl [OPTIONS] < sentence-align-file.xml

 convert sentence aligned bitexts to factored moses input
 (requires XML::Parser)

 OPTIONS:

 -s srcfactors ......... specify source language factors besides surface words
 -t trgfactors ......... the same for the target language (separated by ':')
                         factors should be attributes of <w> tags!!
                         (except 'word' which is the word itself)
 -d dir ................ home directory of the OPUS subcorpus
 -n file-pattern ....... skip bitext files that match pattern (e.g. ep-00-1*)
 -i .................... inverse selection (only files matching file pattern)
 -e src-data-file ...... output file for source language data (default = src)
 -f src-data-file ...... output file for target language data (default = trg)
 -1 .................... extract 1:1 sentence alignments only
 -x max ................ maximum size of selected sentences (default=80)

EOH
    exit;
}


my $CORPUSHOME   = $opt_d || $ENV{HOME}.'/projects/OPUS/corpus/Europarl3';
my $SRCFACTORSTR = $opt_s || "word";
my $TRGFACTORSTR = $opt_t || "word";

# my $SRCFACTORSTR = $opt_s || "lem:tree";
# my $TRGFACTORSTR = $opt_t || "lem:tree";

my $SRCOUTFILE   = $opt_e || 'src';
my $TRGOUTFILE   = $opt_f || 'trg';

my @SrcFactors = split(/:/,$SRCFACTORSTR);
my @TrgFactors = split(/:/,$TRGFACTORSTR);

my $MAX = $opt_x || 100;

## make XML parser object for parsing the sentence alignment file

my $BitextParser = new XML::Parser(Handlers => {Start => \&AlignTagStart,
						End => \&AlignTagEnd});
my $BitextHandler = $BitextParser->parse_start;

## global variables for the source and target language XML parsers

my ($SrcParser,$TrgParser);
my ($SrcHandler,$TrgHandler);

my ($SRC,$TRG);         # filehandles for reading bitexts

## open output files

my $SRCOUT = new FileHandle;
my $TRGOUT = new FileHandle;

unless ($opt_m){
    $SRCOUT->open("> $SRCOUTFILE");
    $TRGOUT->open("> $TRGOUTFILE");

    binmode($SRCOUT, ":utf8");
    binmode($TRGOUT, ":utf8");
}


if ($opt_p){
    open P,">$opt_p" || warn "cannot open $opt_p ...\n";
}

## use '>' as input delimiter when reading (usually end of XML tag)

$/='>';

## read through sentence alignment file and parse XML
## - sub routines for reading source and target language corpora are
##   called from XML handlers connected to this XML parser object
## - aligned sentences have to be in the same order in the corpus files
##   as they appear in the sentence alignment file!

while (<>){
    eval { $BitextHandler->parse_more($_); };
    if ($@){
	warn $@;
	print STDERR $_;
    }
}

$SRCOUT->close();
$TRGOUT->close();

if ($opt_p){
    close P;
}


## finished!
##--------------------------------------------------------------------------




## open source and target corpus files (could be gzipped)
## create new XML parser objects and start parsing

sub OpenCorpora{
    my ($srcfile,$trgfile)=@_;

    if ((! -e "$CORPUSHOME/$srcfile") && (-e "$CORPUSHOME/$srcfile.gz")){
	$srcfile.='.gz';
    }
    if ((! -e "$CORPUSHOME/$trgfile") && (-e "$CORPUSHOME/$trgfile.gz")){
	$trgfile.='.gz';
    }

    if ((! -e "$CORPUSHOME/$srcfile") && ($srcfile=~/xml\//)){	
	$srcfile=~s/xml\///;
	return OpenCorpora($srcfile,$trgfile);
    }
    if ((! -e "$CORPUSHOME/$trgfile") && ($trgfile=~/xml\//)){	
	$trgfile=~s/xml\///;
	return OpenCorpora($srcfile,$trgfile);
    }

    ## check if file names match pattern of files to be skipped
    if (defined $opt_n){
	if ($opt_i){
	    if ($srcfile!~/$opt_n/ && $trgfile!~/$opt_n/){
		print "skip $srcfile-$trgfile\n";
		return 0;
	    }
	}
	elsif ($srcfile=~/$opt_n/ || $trgfile=~/$opt_n/){
	    print "skip $srcfile-$trgfile\n";
	    return 0;
	}
    }

    print STDERR "open bitext $srcfile <-> $trgfile\n";

    ## open filehandles to read from
    $SRC = new FileHandle;
    $TRG = new FileHandle;
    if ($srcfile=~/\.gz$/){$SRC->open("gzip -cd < $CORPUSHOME/$srcfile |");}
    else{$SRC->open("< $CORPUSHOME/$srcfile");}
    if ($trgfile=~/\.gz$/){$TRG->open("gzip -cd < $CORPUSHOME/$trgfile |");}
    else{$TRG->open("< $CORPUSHOME/$trgfile");}

    $SrcParser = new XML::Parser(Handlers => {Start => \&XmlTagStart,
					      End => \&XmlTagEnd,
					      Default => \&XmlChar});
    $SrcHandler = $SrcParser->parse_start;
    $SrcHandler->{OUT} = $SRCOUT;
    @{$SrcHandler->{FACTORS}} = @SrcFactors;
    $TrgParser = new XML::Parser(Handlers => {Start => \&XmlTagStart,
					      End => \&XmlTagEnd,
					      Default => \&XmlChar});
    $TrgHandler = $TrgParser->parse_start;
    $TrgHandler->{OUT} = $TRGOUT;
    @{$TrgHandler->{FACTORS}} = @TrgFactors;

    # multiple file output (option -m): create new output files for each
    # aligned document (use file basenames and the current directory)
    # (use opt_e/opt_f as extension or 'src', 'trg' as default)
    if ($opt_m){
	my $srcout = basename($srcfile);
	my $trgout = basename($trgfile);
	$srcout =~s/\.xml(\.gz)?/.$SRCOUTFILE/;
	$trgout =~s/\.xml(\.gz)?/.$TRGOUTFILE/;

	$SRCOUT->open("> $srcout");
	$TRGOUT->open("> $trgout");
	binmode($SRCOUT, ":utf8");
	binmode($TRGOUT, ":utf8");
    }

}

sub CloseCorpora{
    $SRC->close() if (defined $SRC);
    $TRG->close() if (defined $TRG);
}



## read from file handle and parse XML with corpus parser object
## idstr should be space-delimitered string of sentence IDs
## read until all sentences (all IDs) are found!
## --> all sentences have to exist and have to appear in the same order
##     they are specified in idstr (no crossing links whatsoever!)

sub ParseSentences{
    my ($idstr,$handle,$fh)=@_;
    my @ids=split(/\s+/,$idstr);
    @{$handle->{IDS}}=@ids;

    return if not @{$handle->{IDS}};

    while (<$fh>){
	eval { $handle->parse_more($_); };
	if ($@){
	    warn $@;
	    print STDERR $_;
	}
	return 1 if ($handle->{CLOSEDSID} eq $ids[-1]);
    }
}



##-------------------------------------------------------------------------
## XML parser handlers for sentence alignment parser


## XML opening tags
## - linkGrp --> open a new bitext (source & target corpus file)
## - link    --> a new sentence alignment: read sentences from source & target

sub AlignTagStart{
    my ($p,$e,%a)=@_;

    if ($e eq 'linkGrp'){
	if ($opt_p){
	    print P "## $a{fromDoc}\t$a{toDoc}\n";
	}
	return &OpenCorpora($a{fromDoc},$a{toDoc});
    }

    if ($e eq 'link'){
	## only if bitext filehandles are defined ...
	if (defined $SRC && defined $TRG){
	    my ($src,$trg) = split(/\s*\;\s*/,$a{xtargets});
	    if ($src=~/\S/ && $trg=~/\S/){
		if ($opt_1){
		    return if ($src=~/\S\s\S/);
		    return if ($trg=~/\S\s\S/);
		}

		$SrcHandler->{OUTSTR}='';   # reset output string
		$TrgHandler->{OUTSTR}='';

		&ParseSentences($src,$SrcHandler,$SRC);
		&ParseSentences($trg,$TrgHandler,$TRG);

		unless ($opt_r){
		    # skip if no words found
		    return if (not $SrcHandler->{NRWORDS});
		    return if (not $TrgHandler->{NRWORDS});
		    # skip if sentences are too long
		    return if ($SrcHandler->{NRWORDS} > $MAX);
		    return if ($TrgHandler->{NRWORDS} > $MAX);
		    # skip if ratio<=8 (a bit more restrictive than clean_corpus)
		    return if ($SrcHandler->{NRWORDS}/$TrgHandler->{NRWORDS}>8);
		    return if ($TrgHandler->{NRWORDS}/$SrcHandler->{NRWORDS}>8);
#		    # skip if ratio<=9 (like in clean_corpus for MOSES)
#		   return if ($SrcHandler->{NRWORDS}/$TrgHandler->{NRWORDS}>9);
#		   return if ($TrgHandler->{NRWORDS}/$SrcHandler->{NRWORDS}>9);
		   # skip if one of the strings is empty (shouldn't happen!)
#		   return if ($SrcHandler->{OUTSTR}=~/^\s*$/);
#		   return if ($TrgHandler->{OUTSTR}=~/^\s*$/);
		}
		if ($SrcHandler->{OUTSTR}=~/^\s*$/){
		    unless ($opt_r){
			print STDERR "empty src string but $SrcHandler->{NRWORDS} words! --> strange ... (trg = $TrgHandler->{NRWORDS} words)\n";
		    }
		    return;
		}
		if ($TrgHandler->{OUTSTR}=~/^\s*$/){
		    unless ($opt_r){
			print STDERR "empty trg string but $TrgHandler->{NRWORDS} words! --> strange ... (src = $SrcHandler->{NRWORDS} words)\n";
		    }
		    return;
		}

		if ($opt_r){
		    $SrcHandler->{OUTSTR} = &normalize($SrcHandler->{OUTSTR});
		    $TrgHandler->{OUTSTR} = &normalize($TrgHandler->{OUTSTR});
		}
		else{
		    $SrcHandler->{OUTSTR}=~s/^\s+//;  # remove leading spaces
		    $SrcHandler->{OUTSTR}=~s/\s+$//;  # remove final spaces
		    $TrgHandler->{OUTSTR}=~s/^\s+//;
		    $TrgHandler->{OUTSTR}=~s/\s+$//;
		}

# 		my @srcwords=split(/\s+/,$SrcHandler->{OUTSTR});
# 		if (@srcwords!=$SrcHandler->{NRWORDS}){
# 		    die "string has different number of words than counted\n";
# 		}
# 		my @trgwords=split(/\s+/,$TrgHandler->{OUTSTR});
# 		if (@trgwords!=$TrgHandler->{NRWORDS}){
# 		    die "string has different number of words than counted\n";
# 		}

		print $SRCOUT $SrcHandler->{OUTSTR},"\n" if (defined $SRCOUT);
		print $TRGOUT $TrgHandler->{OUTSTR},"\n" if (defined $TRGOUT);

		if ($opt_p){
		    print P "$src\t$trg\n";
		}
	    }
	}
    }
}


## closing tags: linkGrp --> close source and target corpus files

sub AlignTagEnd{
    my ($p,$e)=@_;
    if ($e eq 'linkGrp'){
	return &CloseCorpora();
    }
}



##-------------------------------------------------------------------------
## XML parser handlers for corpus parser (separate for source and target)


## XML opening tags
## - s: starts a new sentence --> store ID
## - w: starts a new word --> factors should be attributes of this tag!

sub XmlTagStart{
    my ($p,$e,%a)=@_;

    if ($e eq 's'){
	$p->{OPENSID} = $a{id};
	delete $p->{CLOSEDSID};
	$p->{NRWORDS}=0;
	return 1;
    }
    if ($e eq 'w'){
	if ($p->{OPENSID} eq $p->{IDS}->[0]){
	    $p->{OPENW} = 1;
	    %{$p->{WATTR}} = %a;
	    $p->{WORD}='';
	    $p->{NRWORDS}++;
	}
    }
}

## strings within tags
## - open w-tag (or option -r = raw, untokenized text)? 
##   --> print string to SRCOUT

sub XmlChar{
    my ($p,$c)=@_;
    if ($p->{OPENW} || $opt_r){
	if ($p->{OPENSID} eq $p->{IDS}->[0]){
#	    $c=~tr/| \n/___/;
	    $p->{WATTR}->{word}.=$c;
	    $p->{WATTR}->{word}.=' ' if ($opt_r);
	}
    }
}

## XML closing tags
## - s: end of sentence --> shift ID-set if necessary
## - w: end of word --> add space in the end (adds extra space after last word)

sub XmlTagEnd{
    my ($p,$e,%a)=@_;

    if ($e eq 's'){
	$p->{CLOSEDSID} = $p->{OPENSID};
	if ($p->{CLOSEDSID} eq $p->{IDS}->[0]){
	    shift(@{$p->{IDS}});
	}
	delete $p->{OPENSID};
	if ($opt_r){
	    $p->{OUTSTR} .= $p->{WATTR}->{word}.' ';
	    $p->{WATTR}->{word} = '';
	}
    }
    elsif ($e eq 'w'){
	if ($p->{OPENSID} eq $p->{IDS}->[0]){
	    $p->{OPENW} = 0;
	    my @factors=();
	    foreach my $f (@{$p->{FACTORS}}){
		$p->{WATTR}->{$f}=~s/^\s+//s;
		$p->{WATTR}->{$f}=~s/\s+$//s;
		$p->{WATTR}->{$f}=~tr/| \n/___/;    # ' ' and '|' not allowed!
		$p->{WATTR}->{$f} = 'UNKNOWN' if ($p->{WATTR}->{$f}!~/\S/);
		if ($f eq 'lem'){
		    if ($p->{WATTR}->{$f}=~/UNKNOWN/){
			$p->{WATTR}->{$f}=$p->{WATTR}->{word};
		    }
		    elsif ($p->{WATTR}->{$f}=~/\@card\@/){
			$p->{WATTR}->{$f}=$p->{WATTR}->{word};
		    }
		}
		if ($opt_l){
		    $p->{WATTR}->{$f}=lc($p->{WATTR}->{$f});
		}
		push(@factors,$p->{WATTR}->{$f});
	    }
	    if (defined $p->{OUT}){
		my $OUT = $p->{OUT};
		$p->{OUTSTR}.=join('|',@factors);  # save string
		$p->{OUTSTR}.=' ';
#		print $OUT join('|',@factors);     # instead of printing
#		print $OUT ' ';                    # directly!
	    }
	}
    }
}



sub normalize {
    $_[0] =~s/\s+/ /gs;  # normalize white space characters
    $_[0] =~s/^\s*//;
    $_[0] =~s/\s*$//;
    $_[0] =~ s/[\x00-\x1f\x7f\n]//gs;             # control characters
    $_[0] =~ s/\<(s|unk|\/s|\s*and\s*|)\>//gs;    # reserved words
    $_[0] =~ s/\[\s*and\s*\]//gs;
    $_[0] =~ s/\|/_/gs;                           # vertical bars
    return $_[0];
}