The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl -w
#-*-perl-*-
#
# this is a sentence aligner based on Phillip Koehn's implementation used
# for aligning the EuroParl corpus
# http://people.csail.mit.edu/koehn/publications/europarl/
# (and his aligner is based on Gale&Church's length-based alignment approach)
#
# USAGE: sentalign [OPTIONS] srcfile trgfile > alignfile
#
# OPTIONS
#
#  -h 'hard-tag-re' .......... regular expression to match hard boundary tags
#  -i min_length ............. check for identical words (set minimal length)
#  -c threshold .............. use LCSR cognate filter (with threshold)
#  -w window ................. define size of sliding window (for all filters)
#  -d dic-file ............... use dictionary filter (using dic-file)
#                              (the dictionary has to be in UTF-8!)
#  -t ........................ use time-slot length instead of char length
#  -u ........................ cognate/identical filter for upper case words
#  -v ........................ verbose mode
#
#
# TODO
#
#  - re-align + dictionary filter?
#  - distance weighted cognate filter
#  - character pair weights for LCSR
#  - add at least a warning if number of hardboundaries is not identical
#

use strict;
use vars qw($opt_h $opt_l $opt_c $opt_w $opt_d $opt_i $opt_v $opt_t $opt_u);
use Getopt::Std;
use XML::Parser;
use Encode;


getopts('h:c:w:l:i:d:vtu');

## set some global variables
## mostly command line arguments

my $VERBOSE = $opt_v;

my $SENT_TAG = 's';
my $WORD_TAG = 'w';

my $HARD_TAG = $opt_h;         # hard tag is a regular expression!

my $USE_IDENTICAL = $opt_i;    # use cognate filter (identical words)
my $USE_COGNATES = $opt_c;     # use cognate filter (lcsr)
my $USE_DICTIONARY = $opt_d;   # use dictionary filter
my $UPPER_CASE = $opt_u;       # cognate filter with upper case words only

my $TIME_LENGTH = $opt_t;      # align based on time slot length

my $MINLENGTH = $opt_l || 7;
my $WINDOW = $opt_w || 5;


## read dictionary if necessary

my %DIC=();
if ($USE_DICTIONARY){
    ReadDictionary(\%DIC,$USE_DICTIONARY);
}


## open source and target files

my $src_file = shift(@ARGV);
my $trg_file = shift(@ARGV);

if ($src_file=~/\.gz$/){
    open SRC,"gzip -cd < $src_file |" || die "cannot open $src_file!\n";
}
else{open SRC,"<$src_file" || die "cannot open $src_file!\n";}
if ($trg_file=~/\.gz$/){
    open TRG,"gzip -cd < $trg_file |" || die "cannot open $trg_file!\n";
}
else{
    open TRG,"<$trg_file" || die "cannot open $trg_file!\n";
}


## create & initialize XML parser instances

my $src_parser = new XML::Parser(Handlers => {Start => \&XmlStart,
					      End => \&XmlEnd,
					      Default => \&XmlChar,
					  },);


my $trg_parser = new XML::Parser(Handlers => {Start => \&XmlStart,
					      End => \&XmlEnd,
					      Default => \&XmlChar,
					  },);

my $src_ph = $src_parser->parse_start;
my $trg_ph = $trg_parser->parse_start;

if ($USE_COGNATES || $USE_IDENTICAL || $USE_DICTIONARY){
    $src_ph->{STORE_WORDS} = 1;  # we want to store the actual words
    $trg_ph->{STORE_WORDS} = 1;  # if we want to apply a filter for
}


##---------------------------------------------------------------------
## now the main part!
##   - print the XML header for sentence alignment file
##   - run through the XML documents (one section section pair at a time)
##   - close XML files and print XML footer

&print_xmlheader($src_file,$trg_file);

my ($src_len,$trg_len,$src_id,$trg_id);
my $LINK_ID=0;

do{
    ($src_len,$trg_len,
     $src_id,$trg_id) = 
	 &GetNextBitextSegment(*SRC,*TRG,$src_ph,$trg_ph);
    if (@{$src_len} or @{$trg_len}){
	my @links = &sentence_align($src_len,$trg_len,$src_id,$trg_id);
	print_alignments(\@links);
    }
}
until ((not @{$src_len}) && (not @{$trg_len}));


close SRC;
close TRG;


&print_xmlfooter();


## end of main part
##---------------------------------------------------------------------


sub print_xmlheader{
    my ($src_file,$trg_file) = @_;

    print '<?xml version="1.0" encoding="utf-8"?>'."\n";
    print '<!DOCTYPE cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">'."\n";
    print '<cesAlign toDoc="'.$trg_file;
    print '" version="1.0" fromDoc="'.$src_file;
    print '">'."\n";
    print ' <linkGrp targType="s" toDoc="'.$trg_file;
    print '" fromDoc="'.$src_file.'">'."\n";
}

sub print_xmlfooter{
    print "</linkGrp>\n";
    print "</cesAlign>\n";
}



##---------------------------------------------------------------------
## read sections from the XML documents


## read one sentence

sub ReadNextSentence{
    my ($FH,$parser) = @_;

    $parser->{SENTENCE_END} = 0;

    while (not $parser->{SENTENCE_END}){
	my $line = <$FH>;                    # read next line
	if (not $line){                      # end of file?
	    $parser->{SENTENCE_END}=1;       # --> stop
	    $parser->{SECTION_END}=1;
	}
	else{
	    $parser->parse_more($line);      # else: parse line
	}
    }
}



## read one section

sub GetNextBitextSegment{
    my ($SFH,$TFH,$src_parser,$trg_parser) = @_;

    my @src_length=();
    my @trg_length=();

    my @src_ids=();
    my @trg_ids=();

    my $last_src = undef;    # idx of last source sentence in section
    my $last_trg = undef;    # idx of last target sentence in section


    $src_parser->{SENT_IDS} = [] unless defined $src_parser->{SENT_IDS};
    $trg_parser->{SENT_IDS} = [] unless defined $trg_parser->{SENT_IDS};

    if (not @{$src_parser->{SENT_IDS}}){          # no more sentences stored?
	$src_parser->{SENT_LENGTHS} = [0];        # start again at position 0
	$src_parser->{SECTION_END} = 0;           # reset end-of-section flag
	if ($TIME_LENGTH){
	    $src_parser->{TIME_LENGTHS} = [0];
	}
    }
    if (not @{$trg_parser->{SENT_IDS}}){
	$trg_parser->{SENT_LENGTHS} = [0];
	$trg_parser->{SECTION_END} = 0;
	if ($TIME_LENGTH){
	    $trg_parser->{TIME_LENGTHS} = [0];
	}
    }

    ## main loop to read sentences from XML files

    while ((not $src_parser->{SECTION_END}) ||
	   (not $trg_parser->{SECTION_END})){

	## we like to have about the same number of sentences in both languages
	## --> read source sentences while there are less than target sents
	## --> read target sentences while there are less than source sents
	## (stop if end-of-section reached)

	if (($#{$src_parser->{SENT_IDS}} < $#{$trg_parser->{SENT_IDS}}) && 
	    (not $src_parser->{SECTION_END})){
	    ReadNextSentence($SFH,$src_parser);
	}
	elsif (($#{$trg_parser->{SENT_IDS}} < $#{$src_parser->{SENT_IDS}}) &&
	       (not $trg_parser->{SECTION_END})){
	    ReadNextSentence($TFH,$trg_parser);
	}
	## in case we reached end-of-section in one of them
	## or we have the same amount of sentences
	## --> read from source and target
	else{
	    if (not $src_parser->{SECTION_END}){
		ReadNextSentence($SFH,$src_parser);
	    }
	    if (not $trg_parser->{SECTION_END}){
		ReadNextSentence($TFH,$trg_parser);
	    }
	}

	## apply word filter to find possible boundaries
	($last_src,$last_trg) = WordFilter($src_parser,$trg_parser);
	if ((defined $last_src) && (defined $last_trg)){last;}

    }

    # if no filter sugestion any new section borders
    # --> take all sentences!

    if (not defined $last_src){
	$last_src = $#{$src_parser->{SENT_IDS}};
    }

    if (not defined $last_trg){
	$last_trg = $#{$trg_parser->{SENT_IDS}};
    }

    # get sentence parameters from section

    if ($last_src>=0){
	foreach (0..$last_src){
	    my $time_length = shift(@{$src_parser->{TIME_LENGTHS}});
	    my $char_length = shift(@{$src_parser->{SENT_LENGTHS}});
	    shift(@{$src_parser->{TIME}});
	    if ($TIME_LENGTH){push(@src_length,$time_length);}
	    else{push(@src_length,$char_length);}
	    push(@src_ids,shift(@{$src_parser->{SENT_IDS}}));
	    if (defined $src_parser->{WORDS}){
		shift(@{$src_parser->{WORDS}});
	    }
	}
	if ($TIME_LENGTH){
	    push(@src_length,$src_parser->{TIME_LENGTHS}->[-1]);;
	}
	else{
	    push(@src_length,$src_parser->{SENT_LENGTHS}->[-1]);;
	}
    }


    if ($last_trg>=0){
	foreach (0..$last_trg){
	    my $time_length = shift(@{$trg_parser->{TIME_LENGTHS}});
	    my $char_length = shift(@{$trg_parser->{SENT_LENGTHS}});
	    shift(@{$trg_parser->{TIME}});
	    if ($TIME_LENGTH){push(@trg_length,$time_length);}
	    else{push(@trg_length,$char_length);}
	    push(@trg_ids,shift(@{$trg_parser->{SENT_IDS}}));
	    if (defined $trg_parser->{WORDS}){
		shift(@{$trg_parser->{WORDS}});
	    }
	}
	if ($TIME_LENGTH){
	    push(@trg_length,$trg_parser->{TIME_LENGTHS}->[-1]);;
	}
	else{
	    push(@trg_length,$trg_parser->{SENT_LENGTHS}->[-1]);;
	}

    }

    return (\@src_length,\@trg_length,\@src_ids,\@trg_ids);

}











##---------------------------------------------------------------------
## WordFilter
##   run with a sliding window through the bitext segment
##   and check if there are matching word pairs
##   in the sentence pairs considered


sub WordFilter{

    my ($src_parser,$trg_parser)=@_;

    my $i = $#{$src_parser->{SENT_IDS}};
    my $j = $#{$trg_parser->{SENT_IDS}};

    if (not exists $src_parser->{WORDS}){return (undef,undef);}
    if (not exists $trg_parser->{WORDS}){return (undef,undef);}

    my $src_sents = $src_parser->{WORDS};
    my $trg_sents = $trg_parser->{WORDS};

    if ($i<0){return ($i,$j);}
    if ($j<0){return ($i,$j);}

    my $w = 0;
    while ($w < $WINDOW){
	if ($i>$j){
	    if ($i-$w >= 0 && $i-$w <= $j){
		if (find_match($$src_sents[$i],$$trg_sents[$i-$w])){
		    print STDERR " in ",$i,' ',$i-$w,"!\n" if $VERBOSE;
		    return($i,$i-$w);
		}
	    }
	    if ($w > 0 && $i+$w <= $j){
		if (find_match($$src_sents[$i],$$trg_sents[$i+$w])){
		    print STDERR " in ",$i,' ',$i+$w,"!\n" if $VERBOSE;
		    return($i-$w,$j);
		}
	    }
	}
	else{
	    if ($j-$w >= 0 && $j-$w <= $i){
		if (find_match($$src_sents[$j-$w],$$trg_sents[$j])){
		    print STDERR " in ",$j-$w,' ',$j,"!\n" if $VERBOSE;
		    return($j-$w,$j);
		}
	    }
	    if ($w > 0 && $j+$w <= $i){
		if (find_match($$src_sents[$j+$w],$$trg_sents[$j])){
		    print STDERR " in ",$j+$w,' ',$j,"!\n" if $VERBOSE;
		    return($j+$w,$j);
		}
	    }
	}
	$w++;
    }
    return (undef,undef);
}



sub find_match{
    if ($USE_IDENTICAL){
	if (identical(@_,$USE_IDENTICAL)){return 1;}
    }
    if ($USE_COGNATES){
	if (cognates(@_,$MINLENGTH,$USE_COGNATES)){return 1;}
    }
    if ($USE_DICTIONARY){
	if (dictionary(@_)){return 1;}
    }
    return 0;
}



sub identical{
    my ($src,$trg,$minlength)=@_;

    ## make lower case version of first word
    ## to avoid problems with the 'only upper case words'
    ## (quite ad-hoc)
    if ($UPPER_CASE){
	$$src[0] = lc($$src[0]);
	$$trg[0] = lc($$trg[0]);
    }

    my %src_words=();
    my %trg_words=();
    foreach (@{$src}){
	$src_words{$_}++;
    }
    foreach (@{$trg}){
	$trg_words{$_}++;
    }

    foreach (keys %src_words){
	if (length($_)<$minlength){next;}
	if ($UPPER_CASE){if ($_!~/^\p{Lu}/){next;}}
	if (defined $trg_words{$_}){
	    print STDERR "found identical string '$_' " if $VERBOSE;
	    return 1;
	}
    }
    return 0;
}



sub cognates{

    my ($src,$trg,$minlength,$minscore)=@_;

    ## make lower case version of first word
    ## to avoid problems with the 'only upper case words'
    ## (quite ad-hoc)
    if ($UPPER_CASE){
	$$src[0] = lc($$src[0]);
	$$trg[0] = lc($$trg[0]);
    }

    my %src_words=();
    my %trg_words=();
    foreach (@{$src}){
	$src_words{$_}++;
    }
    foreach (@{$trg}){
	$trg_words{$_}++;
    }

    foreach my $s (keys %src_words){
	my $s_len = length($s);
	if ($s_len < $minlength){next;}
	if ($UPPER_CASE){if ($s!~/^\p{Lu}/){next;}}
	foreach my $t (keys %trg_words){
	    my $t_len = length($t);
	    if ($t_len < $minlength){next;}
	    if ($UPPER_CASE){if ($t!~/^\p{Lu}/){next;}}
	    if ($s eq $t){
		print STDERR "found cognate '$s' - '$t' " if $VERBOSE;
		return 1;
	    }
	    if ($s_len > $t_len){
		if ($t_len/$s_len < $minscore){next;}
		if (LCS($s,$t)/$s_len > $minscore){
		    print STDERR "found cognate '$s' - '$t' " if $VERBOSE;
		    return 1;
		}
	    }
	    else{
		if ($s_len/$t_len < $minscore){next;}
		if (LCS($s,$t)/$t_len > $minscore){
		    print STDERR "found cognate '$s' - '$t' " if $VERBOSE;
		    return 1;
		}
	    }
	}
    }
    return 0;
}



sub LCS {
    my ($src,$trg)=@_;
    my (@l,$i,$j);
    my @src_let=split(//,$src);		# split string into char
    my @trg_let=split(//,$trg);
    unshift (@src_let,'');
    unshift (@trg_let,'');
  for ($i=0;$i<=$#src_let;$i++){                # initialize the matrix
      $l[$i][0]=0;
  }
  for ($i=0;$i<=$#trg_let;$i++){
      $l[0][$i]=0;
  }                                                       # weight function is

    for $i (1..$#src_let){
	for $j (1..$#trg_let){
	    if ($src_let[$i] eq $trg_let[$j]){
		$l[$i][$j]=$l[$i-1][$j-1]+1;
	    }
	    else{
		if ($l[$i][$j-1]>$l[$i-1][$j]){
		    $l[$i][$j]=$l[$i][$j-1];
		}
		else{
		    $l[$i][$j]=$l[$i-1][$j];
		}
	  }
	}
    }
    return $l[$#src_let][$#trg_let];
}


sub ReadDictionary{
    my ($dic,$file)=@_;
    if (-e $file){
	if ($file=~/\.gz$/){
	    open DIC,"gzip -cd < $file |" || 
		die "cannot open dictionary file $file!\n";
	}
	else{
	    open DIC,"< $file " || die "cannot open dictionary file $file!\n";
	}
	binmode(DIC,":encoding(utf8)");
	while (<DIC>){
	    chomp;
	    my ($src,$trg) = split(/\s/);
	    $$dic{$src}{$trg}++;
	}
    }
}




sub dictionary{
    my ($src,$trg)=@_;

    my %src_words=();
    my %trg_words=();
    foreach (@{$src}){
	$src_words{$_}++;
    }
    foreach (@{$trg}){
	$trg_words{$_}++;
    }

    foreach my $s (keys %src_words){
	if (exists $DIC{$s}){
	    foreach my $t (keys %trg_words){
		if (exists $DIC{$s}{$t}){
		    print STDERR "found in dic '$s' - '$t' " if $VERBOSE;
		    return 1;
		}
	    }
	}
    }

    return 0;
}




##---------------------------------------------------------------------
## handlers for XML parser

sub XmlStart{
    my ($p,$e,%attr) = @_;
    if ($e eq $SENT_TAG){
	$p->{START2SID}->{$p->{SENT_LENGTHS}->[-1]}=$attr{id};
	push(@{$p->{SENT_IDS}},$attr{id});
	push(@{$p->{SENT_LENGTHS}},$p->{SENT_LENGTHS}->[-1]);
	$p->{SENT_OPEN} = 1;
    }
    if ($e eq $WORD_TAG){
	$p->{WORD_OPEN} = 1;
	if ($p->{STORE_WORDS}){
	    my $sidx = $#{$p->{SENT_IDS}};
	    push (@{$p->{WORDS}->[$sidx]},'');
	}
    }

    if ($TIME_LENGTH){
	if ($e eq 'time'){
	    if (ref($p->{TIME}) ne 'ARRAY'){
		$p->{TIME}=[];
		$p->{SEC_PER_CHAR} = 1;   # wild guess ....
	    }
	    my $sidx = $#{$p->{SENT_IDS}};

	    if ($sidx>=0){
		## time at sent start --> easy case: just store as start point
		if ($p->{SENT_LENGTHS}->[-1] == $p->{SENT_LENGTHS}->[-2]){
		    $p->{TIME}->[$sidx]->{start}=time2sec($attr{value});
		    $p->{TIME}->[$sidx]->{first}=$p->{TIME}->[$sidx]->{start};
		    $p->{TIME}->[$sidx]->{first_pos}=$p->{SENT_LENGTHS}->[-1];
		}
		## first time tag in this sentence but not at start position
		elsif ($#{$p->{TIME}} < $sidx){
		    $p->{TIME}->[$sidx]->{first}=time2sec($attr{value});
		    $p->{TIME}->[$sidx]->{first_pos}=$p->{SENT_LENGTHS}->[-1];
		}
		## other time events -> just store as last seen time
		else{
		    $p->{TIME}->[$sidx]->{last}=time2sec($attr{value});
		    $p->{TIME}->[$sidx]->{last_pos}=$p->{SENT_LENGTHS}->[-1];
		    ## update SEC_PER_CHAR estimation
		    my $sec = $p->{TIME}->[$sidx]->{last} - 
			$p->{TIME}->[$sidx]->{first};
		    my $chars = $p->{TIME}->[$sidx]->{last_pos} - 
			$p->{TIME}->[$sidx]->{first_pos};
		    if ($sec>0 && $chars>0){
			$p->{SEC_PER_CHAR} = $sec/$chars;
		    }
		}
	    }
	}
    }
}


sub XmlEnd{
    my $p=shift;
    my $e=shift;
    if ($e eq $SENT_TAG){
	delete $p->{SENT_OPEN};
	$p->{SENTENCE_END} = 1;         # flag to indicate end-of-sentence
    }
    elsif (defined $HARD_TAG){
	if ($e=~/$HARD_TAG/){
	    $p->{SECTION_END} = 1;	
	    print STDERR "hard boundary ($e) found!\n" if $VERBOSE;
	}
    }
    if ($e eq $WORD_TAG){
	delete $p->{WORD_OPEN};
    }

    if ($TIME_LENGTH && ($e eq $SENT_TAG)){
	my $sidx = $#{$p->{SENT_IDS}};
	## hardest case: no time at all found in this sentence	
	if ($#{$p->{TIME}}<$sidx){
	    if ($sidx == 0){
		$p->{TIME}->[$sidx]->{start} = 0;
	    }
	    else{
		## take end of last sentence as start for new one
		$p->{TIME}->[$sidx]->{start} = $p->{TIME}->[$sidx-1]->{end};
	    }
	}
	elsif (not $p->{TIME}->[$sidx]->{start}){
	    my $before = $p->{TIME}->[$sidx]->{first_pos} - 
		$p->{SENT_LENGTHS}->[-2];
	    if (defined $p->{TIME}->[$sidx]->{last}){
		my $secs = $p->{TIME}->[$sidx]->{last} - 
		    $p->{TIME}->[$sidx]->{first};
		my $chars = $p->{TIME}->[$sidx]->{last_pos} - 
		    $p->{TIME}->[$sidx]->{first_pos};
		if ($secs>0 && $chars>0){
		    $p->{TIME}->[$sidx]->{start} = 
			$p->{TIME}->[$sidx]->{first} - 
			$before*$secs/$chars;
		}
		else{
		    $p->{TIME}->[$sidx]->{start} = 
			$p->{TIME}->[$sidx]->{first}-
			$p->{SEC_PER_CHAR}*$before;
		}
	    }
	    else{
		$p->{TIME}->[$sidx]->{start} = 
		    $p->{TIME}->[$sidx]->{first} - $p->{SEC_PER_CHAR}*$before;
	    }
	}
	if (defined $p->{TIME}->[$sidx]->{last}){
	    my $after = $p->{SENT_LENGTHS}->[-1] - 
		$p->{TIME}->[$sidx]->{last_pos};
	    if ($after){
		my $secs = $p->{TIME}->[$sidx]->{last} - 
		    $p->{TIME}->[$sidx]->{first};
		my $chars = $p->{TIME}->[$sidx]->{last_pos} - 
		    $p->{TIME}->[$sidx]->{first_pos};
		if ($secs>0 && $chars>0){
		    $p->{TIME}->[$sidx]->{end} = 
			$p->{TIME}->[$sidx]->{last} + $after*$secs/$chars;
		}
		else{
		    $p->{TIME}->[$sidx]->{end} = 
			$p->{TIME}->[$sidx]->{first}+
			$after*$p->{SEC_PER_CHAR};
		}
	    }
	    else{
		$p->{TIME}->[$sidx]->{end} = $p->{TIME}->[$sidx]->{last};
	    }
	}
	else{
	    my $chars=$p->{SENT_LENGTHS}->[-1]-$p->{SENT_LENGTHS}->[-2];
	    $p->{TIME}->[$sidx]->{end} = 
		$p->{TIME}->[$sidx]->{start} + $chars*$p->{SEC_PER_CHAR};
	}
	my $time = $p->{TIME}->[$sidx]->{end}-$p->{TIME}->[$sidx]->{start};
	if ($time>0){
	    push (@{$p->{TIME_LENGTHS}},$p->{TIME_LENGTHS}->[-1]+$time);
	}
	else{
	    ## don't allow time<=0 --> add small time length ...
	    print STDERR "warning! sentence $sidx is $time seconds long\n";
	    push (@{$p->{TIME_LENGTHS}},$p->{TIME_LENGTHS}->[-1]+0.001);
	}
    }

}


sub XmlChar{
    my $p=shift;
    my $e=shift;

    if ($p->{SENT_OPEN}){
	if ($p->{WORD_OPEN}){
	    $p->{SENT_LENGTHS}->[-1] += length($e);
	}
    }

    if ($p->{STORE_WORDS}){
	if ($p->{WORD_OPEN}){
	    my $sidx = $#{$p->{SENT_IDS}};
	    $p->{WORDS}->[$sidx]->[-1] .= $e;
	}
    }
}


sub time2sec{
    my $time=shift;
    my ($h,$m,$s,$ms)=split(/[^0-9\-]/,$time);
    if (not $h){$h=0;}
    if (not $m){$m=0;}
    if (not $s){$s=0;}
    if (not $ms){$ms=0;}
    my $sec = 3600*$h+60*$m+$s+$ms/1000;
    return $sec;
}




##---------------------------------------------------------------------
## alignment related functions


# print sentence links

sub print_alignments{
    my $LINKS = shift;
    foreach my $i (0..$#{$LINKS}){
	print '<link id="SL'.$LINK_ID.'" xtargets="';
	if (ref($$LINKS[$i]{src}) eq 'ARRAY'){
	    print join(' ',@{$$LINKS[$i]{src}});
	}
	print ';';
	if (ref($$LINKS[$i]{trg}) eq 'ARRAY'){
	    print join(' ',@{$$LINKS[$i]{trg}});
	}
	print "\" />\n";
	$LINK_ID++;
    }
}







# this is a vanilla implementation of church and gale
sub sentence_align {
  my ($LEN1,$LEN2,$IDS1,$IDS2) = @_;

  if (not @{$LEN1}){
      my @LINKS=();
      $LINKS[0]{src} = [];
      foreach (0..$#{$LEN2}-1){
#	  push(@{$LINKS[0]{trg}},$$IDS2{$$LEN2[$_]});
	  push(@{$LINKS[0]{trg}},$$IDS2[$_]);
      }
      return @LINKS;
  }
  if (not @{$LEN2}){
      my @LINKS=();
      $LINKS[0]{trg} = [];
      foreach (0..$#{$LEN1}-1){
#	  push(@{$LINKS[0]{src}},$$IDS1{$$LEN1[$_]});
	  push(@{$LINKS[0]{src}},$$IDS1[$_]);
      }
      return @LINKS;
  }

  # parameters
  my %PRIOR;
#  $PRIOR{1}{1} = 0.89;
#  $PRIOR{1}{0} = 0.01/2;
#  $PRIOR{0}{1} = 0.01/2;
#  $PRIOR{2}{1} = 0.089/2;
#  $PRIOR{1}{2} = 0.089/2;
##  $PRIOR{2}{2} = 0.011;
#  $PRIOR{3}{1} = 0.011/2;
#  $PRIOR{1}{3} = 0.011/2;

# for subtitles:
  $PRIOR{1}{1} = 0.6829; # 896
  $PRIOR{2}{1} = 0.0762; # 100
  $PRIOR{0}{1} = 0.0694; # 91
  $PRIOR{1}{0} = 0.0564; # 74
  $PRIOR{1}{2} = 0.0549; # 72
  $PRIOR{1}{3} = 0.0183; # 24
  $PRIOR{3}{1} = 0.0122; # 16
 

  # dynamic programming
  my (@COST,@BACK);
  $COST[0][0] = 0;
  for(my $i1=0;$i1<=$#{$LEN1};$i1++) {
    for(my $i2=0;$i2<=$#{$LEN2};$i2++) {
      next if $i1 + $i2 == 0;
      $COST[$i1][$i2] = 1e10;
      foreach my $d1 (keys %PRIOR) {
	next if $d1>$i1;
	foreach my $d2 (keys %{$PRIOR{$d1}}) {
	  next if $d2>$i2;
	  my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) +  
	    &match($$LEN1[$i1]-$$LEN1[$i1-$d1], $$LEN2[$i2]-$$LEN2[$i2-$d2]);
#	  print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n";
	  if ($cost < $COST[$i1][$i2]) {
	    $COST[$i1][$i2] = $cost;
	    @{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2);
	  }
	}
      }
#      print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) ";
    }
#    print "\n";
  }


  # back tracking
  my %NEXT=();
  my @LINKS=();

  my $i1 = $#{$LEN1};
  my $i2 = $#{$LEN2};

  while($i1>0 || $i2>0) {
#    print "back $i1 $i2\n";
      @{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2);
      ($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]);
  }
  while($i1<$#{$LEN1} || $i2<$#{$LEN2}) {
      push(@LINKS,{});
#     print "fwd $i1 $i2\n";
#      print '<link id="SL'.$LINK_ID.'" xtargets="';
      for(my $i=$i1;$i<$NEXT{$i1}{$i2}[0];$i++) {
#	  my $sid = $$IDS1{$$LEN1[$i]};
	  my $sid = $$IDS1[$i];
	  push(@{$LINKS[-1]{src}},$sid);
#	  print " " unless $i == $i1;
#	  print $$IDS1[$i];
      }
#      print ";";
      for(my $i=$i2;$i<$NEXT{$i1}{$i2}[1];$i++) {
#	  my $sid = $$IDS2{$$LEN2[$i]};
	  my $sid = $$IDS2[$i];
	  push(@{$LINKS[-1]{trg}},$sid);
#	  push(@{$LINKS[-1]{trg}},$$LEN2[$i]);
#	  print " " unless $i == $i2;
#	  print $$IDS2[$i];
      }
#      print "\" />\n";
      ($i1,$i2) = @{$NEXT{$i1}{$i2}};
  }  

  return @LINKS;
}



sub match {
  my ($len1,$len2) = @_;
  my $c = 1;
  my $s2 = 6.8;

  if ($len1==0 && $len2==0) { return 0; }
  my $mean = ($len1 + $len2/$c) / 2;
  my $z = ($c * $len1 - $len2)/sqrt($s2 * $mean);
  if ($z < 0) { $z = -$z; }
  my $pd = 2 * (1 - &pnorm($z));
  if ($pd>0) { return -log($pd); }
  return 25;
}

sub pnorm {
  my ($z) = @_;
  my $t = 1/(1 + 0.2316419 * $z);
  return 1 - 0.3989423 * exp(-$z * $z / 2) *
    ((((1.330274429 * $t 
	- 1.821255978) * $t 
       + 1.781477937) * $t 
      - 0.356563782) * $t
     + 0.319381530) * $t;
}