AlignmentSet.pm - metacpan.org

########################################################################
# Author:  Patrik Lambert (lambert@talp.ucp.es)
# Description: Tools library to manage an Alignment Sets, i.e. a set of 
#              sentences aligned at the word (or phrase) level.
#-----------------------------------------------------------------------
#  Copyright 2004 by Patrik Lambert
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
########################################################################

package Lingua::AlignmentSet;
use 5.005;
use vars qw($VERSION);
use strict;
$VERSION = 1.1;
use Lingua::AlSetLib 1.1;
use Lingua::Alignment 1.1;
use Lingua::WriteLatexFile;
use Lingua::AlignmentEval;

use Dumpvalue;
use IO::File;

my $dumper=new Dumpvalue;
my $true = 1;
my $false = 0;

sub new {
	my ($pkg,$refToFileSets) = @_;
    my $refToLocation = readLocation($refToFileSets->[0][0]);
    my $format =  $refToFileSets->[0][1];
    my $range = $refToFileSets->[0][2];
    my $alSet = {};
   	#default values:
   	if (!defined($format)){$format="TALP"}
   	else {$format = uc $format};
	if (!defined($range)){$range="1-"};
	if ($format eq "BLINKER"){
   		#for future ease we save detailed infos contained in the source sample path
		completeBlinkerLocation($refToLocation);
	}
		
	$alSet->{location}=$refToLocation;
	$alSet->{format}=$format;
	    
	setRange($alSet,$range);
	
    #checking the data:
    if ($format eq "GIZA"){
#    	if ($ambiguity || $confidence){die "GIZA format not compatible with ambiguity or confidence features"}
    } elsif ($format eq "BLINKER"){
    } elsif ($format eq "NAACL"){
    } elsif ($format eq "TALP"){
    } else {
    	die "Unknown format $format. Can't create alignment set object";	
    }
    return bless $alSet,$pkg;    
}

# create a new AlignmentSet that contains the same data of an already existing alignment set (without copying the addresses)
sub copy {
	my $alSet = shift;

	my $cloneLocation={};
	my ($field,$value);
	while (($field,$value)=each (%{$alSet->{location}})){
		$cloneLocation->{$field}=$value;
	}
	return Lingua::AlignmentSet->new([[$cloneLocation,$alSet->{format},$alSet->{firstSentPair}."-".$alSet->{lastSentPair}]]);
}

sub setWordFiles{
	my ($alSet,$sourcePath,$targetPath) = @_;
	
	$alSet->{location}->{source}=$sourcePath;
	$alSet->{location}->{target}=$targetPath;	
}
sub setSourceFile{
	my ($alSet,$sourcePath) = @_;
	
	$alSet->{location}->{source}=$sourcePath;
}
sub setTargetFile{
	my ($alSet,$targetPath) = @_;
	
	$alSet->{location}->{target}=$targetPath;	
}

sub setTargetToSourceFile{
	my ($alSet,$targetToSourcePath) = @_;
	
	$alSet->{location}->{targetToSource}=$targetToSourcePath;
}

sub chFormat {
	my ($alSet,$newLocation,$newFormat,$alignMode)=@_;

	$alSet->convert($newLocation,$newFormat,$alignMode);
}


# Won't work if the sentence files are not specified
sub visualise {
    my ($alSet,$representation,$format,$outputFH,$mark,$alignMode,$maxRows,$maxCols)=@_;
    $representation = lc $representation;
    $format = lc $format;
    if (!defined($outputFH)){$outputFH=*STDOUT}
    if ($representation eq "matrix"){
	if (!defined($mark)){$mark = "cross"}
	if (!defined($maxRows)){$maxRows = 53}	#default maxRows value
	if (!defined($maxCols)){$maxCols = 35}	#default maxRows value
	$format="latex";
    }
    my $latex = Lingua::Latex->new;
    if ($format eq "latex"){
	print $outputFH $latex->startFile;
	print $outputFH $latex->setTabcolsep("0.5mm");
    }
    my $output = "";
    my $inputSentPairNum = $alSet->{firstSentPair};
    my $i;
    my ($al,$alSetChunk);
    my $FH = $alSet->openFiles();
    if (($alSet->{format} ne "GIZA") && (!$FH->{source} || !$FH->{target})){
	die "To use the 'visualise' function, you must specify the sentence (words) files.\n";
    }
    while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){	# returns 0 if eof or last sentence pair
	$output = "";
	for ($i=0;$i<@$alSetChunk;$i++){
	    $al = $$alSetChunk[$i];
#			print main::Dumper($al);
	    if ($representation eq "matrix"){
		$output.= "\n$inputSentPairNum\n".$al->displayAsMatrix($latex,$mark,$maxRows,$maxCols);
	    }elsif ($representation eq "enumlinks"){
		$output.= "\n$inputSentPairNum\n".$al->displayAsLinkEnumeration($format,$latex);
	    } #elsif
	}#for
	    print $outputFH $output;
	$inputSentPairNum++;
    }
    if ($format eq "latex"){print $outputFH $latex->endFile};
}

#only work if the text files are given (not only the alignment files).
sub getSize {
	my $alSet = shift;
	my ($file,$factor);
	my $size;
	
	if ($alSet->{format} eq "GIZA"){
		$file = $alSet->{location}->{sourceToTarget};
		$factor = 3;
	}elsif ($alSet->{format} eq "NAACL" || $alSet->{format} eq "BLINKER" || $alSet->{format} eq "TALP"){
		if (!$alSet->{location}->{source}){
			die "One of the functions your are using requires you specify the sentence files (source and target)\n";	
		}
		$file = $alSet->{location}->{source};
		$factor = 1;
	}
	open (FILE,"<$file");
	$size += tr/\n/\n/ while sysread(FILE, $_, 2 ** 16);
	close(FILE);
	$size = $size / $factor;
	return $size;
}

# returns a list (in random order) of lineNumbers
# to sort this list, do:	my @sortedSelection = sort { $a <=> $b; } @selection;

sub chooseSubsets {
	#TO DO: possibility of percentage input for the size
	my	($alSet,$seed,$size) = @_;
	my $alSetSize = $alSet->getSize();
	my $count;
	my @selected=();
	my @notSelected = ();
	my ($ind,$elt);

	for ($count=1;$count<$alSetSize;$count++){
		push @notSelected,$count;	
	}
	srand $seed;
	for ($count=0;$count<$size;$count++){
		$ind = rand @notSelected;
		$elt = $notSelected[$ind];
		splice @notSelected, $ind, 1;
		push @selected,$elt;
	}	
	return \@selected;
}
###################################################################
###   EVALUATION                                                ###
###################################################################

#code adapted from Rada Mihalcea's wa_eval_align.pl, rada@cs.unt.edu
# Evaluation is performed using: 
#  - Standard Precision, Recall, F-measure, separate for S (Sure) and P (Possible) cases
#  - AER measure, defined as 
#    AER = 1 - ( |A & S| + |A & P| ) / ( |A| + |S| )
#    [where A represents the alignment, S and P represent the S (Sure) and P (Possible) gold standard alignments]

sub evaluate {
    my ($submissionAlSet,$answerAlSet,$alignMode,$weighted)=@_;
    if (!defined($weighted)){$weighted=0}
    my ($line,$alignment);
    my ($FH,$alSetChunk,$i,$al,$fhPos);
    my ($inputSentPairNum,$internalSentPairNum,$sentPairNum);
    my ($sureMatch,$possibleMatch,$possibleMatchSure);
    my ($surePrecision,$sureRecall,$possiblePrecision,$possibleRecall,$sureFMeasure,$possibleFMeasure,$AER);
# 1	READ ANSWER AND SUBMISSION FILES
# (in the case of NAACL format file it's more efficient to treat it directly, otherwise load to internal structure)
    # answer file
    my %sureAnswer;
    my %possibleAnswer;
    my $INFINITY = 9999999999;
    $inputSentPairNum = $answerAlSet->{firstSentPair};
    $internalSentPairNum = 1;
    if ( $answerAlSet->{format} eq "NAACL" && $alignMode eq "as-is" && $answerAlSet->{firstSentPair} == 1){
	my $answerFH = IO::File->new("<".$answerAlSet->{location}{sourceToTarget}) or die "Answer alignment file opening error:$!";				
	#go to first sentence pair:
	$fhPos = $answerFH->getpos; 
	while ($answerFH->getline() !~ m/^0*$inputSentPairNum .*/o && !$answerFH->eof()) {
	    $fhPos = $answerFH->getpos; 
	}
	if ($answerFH->eof()){
	    die "First sentence pair of range not found in ".$answerAlSet->{location}{sourceToTarget}; 	
	}
	$answerFH->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
	
	#read file:
	if ($answerAlSet->{lastSentPair} eq "eof"){
	    $inputSentPairNum = $INFINITY;
	}else{
	    $inputSentPairNum = $answerAlSet->{lastSentPair}+1;
	}
	while(!$answerFH->eof() && ( ($line=$answerFH->getline()) !~ m/^0*$inputSentPairNum .*/o )) {
	    chomp $line;
	    $line =~ s/^\s+|\s+$//g;
	    identifySurePossible($line,\%sureAnswer,\%possibleAnswer);
	}	
	$answerFH->close();
    }else{
	$FH = $answerAlSet->openFiles();
	while ($alSetChunk = $answerAlSet->loadChunk($FH,$inputSentPairNum,$alignMode)){	# returns 0 if eof or last sentence pair
	    for ($i=0;$i<@$alSetChunk;$i++){
		$al = $$alSetChunk[$i];
#				print "EVALUATE: answer al:\n";
#				print main::Dumper($al);
		foreach $line (@{$al->writeToBlinker()}){
		    $line = "$internalSentPairNum ".$line;
		    identifySurePossible($line,\%sureAnswer,\%possibleAnswer);	
		}
	    }
	    $inputSentPairNum++;
	    $internalSentPairNum++;
	}
	closeFiles($FH,$answerAlSet->{format});
    }# if format - else
	
    # submission file
    my %sureSubmission;
    my %possibleSubmission;	
    $inputSentPairNum = $submissionAlSet->{firstSentPair};
    $internalSentPairNum = 1;

    if ($submissionAlSet->{format} eq "NAACL" && $alignMode eq "as-is" && $submissionAlSet->{firstSentPair}==1){
	my $submissionFH = IO::File->new("<".$submissionAlSet->{location}{sourceToTarget}) or die "Submission alignment file opening error:$!";				

	#go to first sentence pair:
	$fhPos = $submissionFH->getpos; 
	while ($submissionFH->getline() !~ m/^0*$inputSentPairNum .*/o && !$submissionFH->eof()) {
	    $fhPos = $submissionFH->getpos; 
	}
	if ($submissionFH->eof()){
	    die "First sentence pair of range not found in ".$submissionAlSet->{location}{sourceToTarget}; 	
	}
	$submissionFH->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
	
	#read file:
	if ($submissionAlSet->{lastSentPair} eq "eof"){
	    $inputSentPairNum = $INFINITY;
	}else{
	    $inputSentPairNum = $submissionAlSet->{lastSentPair}+1;
	}
	while(!$submissionFH->eof() && (($line = $submissionFH->getline()) !~ m/^0*$inputSentPairNum .*/o )) {
	    chomp $line;
	    $line =~ s/^\s+|\s+$//g;
	    identifySurePossible($line,\%sureSubmission,\%possibleSubmission);
	}	
	$submissionFH->close();
    }else{
	$FH = $submissionAlSet->openFiles();
	while ($alSetChunk = $submissionAlSet->loadChunk($FH,$inputSentPairNum,$alignMode)){	# returns 0 if eof or last sentence pair
	    for ($i=0;$i<@$alSetChunk;$i++){
		$al = $$alSetChunk[$i];
		# print "submission al:\n";
		# $dumper->dumpValue($al);
		foreach $line (@{$al->writeToBlinker()}){
		    $line = "$internalSentPairNum ".$line;					
		    identifySurePossible($line,\%sureSubmission,\%possibleSubmission);	
		}
	    }
	    $inputSentPairNum++;
	    $internalSentPairNum++;
	}
	closeFiles($FH,$submissionAlSet->{format});
    }# if format=NAACL else
	
	
#	print "weighted:$weighted\n";
#	print "SA:".join("-",keys %sureAnswer),"\nSS:".join(" - ",keys %sureSubmission),"\nPA:".join(" - ",keys %possibleAnswer),"\nPS:".join(" - ",keys %possibleSubmission)."\n";

# 2 WEIGHT LINKS
# It is a kind of "normalization" of multiple links: each link (j i) is weighted according to 
# the number of links in which j and i are involved: weight(j,i)=0.5*(1/numLinks(j)+1/numLinks(i)). 

	my ($link,$j,$hash,$value);
	my %weightsSure;
	my %linksSure;
	my @linksSureInSentence;
	my %linksPossible;
	my @linksPossibleInSentence;
	my %weightsPossible;

	if ($weighted){
	# When only sure links are considered (calculation of Ps and Rs), they are weighted with respect to the union of both sure sets
		# take union
		foreach $hash ( \%sureSubmission, \%sureAnswer ) {
		    while (($link, $value) = each %$hash) {
		    	($sentPairNum,$j,$i)=split(" ",$link);
		        $linksSure{$sentPairNum}{"$j $i"} = $value;
		    }
		}
		# calculate weight of each link
		foreach  $sentPairNum (keys %linksSure){
			@linksSureInSentence =keys %{$linksSure{$sentPairNum}}; 
			foreach $link (@linksSureInSentence){
				($j,$i)=split(" ",$link);
				$weightsSure{"$sentPairNum $link"}=0.5*( 1/grep(/^$j /,@linksSureInSentence)+1/grep(/ $i$/,@linksSureInSentence) );
			}
		}

	# When all links are considered (calculation of Pp and Rp, AER), possible AND sure links are weighted with respect to the union of all sets.
		%linksPossible=%linksSure;
		# add union of possible links
		foreach $hash (\%possibleSubmission, \%possibleAnswer ) {
		    while (($link, $value) = each %$hash) {
		    	($sentPairNum,$j,$i)=split(" ",$link);
		        $linksPossible{$sentPairNum}{"$j $i"} = $value;
		    }
		}
		# calculate weight of each link
		foreach  $sentPairNum (keys %linksPossible){
			@linksPossibleInSentence =keys %{$linksPossible{$sentPairNum}}; 
			foreach $link (@linksPossibleInSentence){
				($j,$i)=split(" ",$link);
				$weightsPossible{"$sentPairNum $link"}=0.5*( 1/grep(/^$j /,@linksPossibleInSentence)+1/grep(/ $i$/,@linksPossibleInSentence) );
			}
		}
	}
	
# 3 SUM UP LINKS
	# in case of weights distinct from 1: sum of %possibleAnswer and %possibleSubmission is always with %weightsPossible.
	# however the sum of %sureAnswer and %sureSubmission is with %weightsSure to calculate Ps and Rs, %weightsPossible for Pp, Rp and AER.
	my ($totalPossibleAnswer,$totalPossibleSubmission)=(0,0);
	my ($totalSureAnswer_weightsSure,$totalSureSubmission_weightsSure,$totalSureAnswer_weightsPossible,$totalSureSubmission_weightsPossible)=(0,0,0,0);
	if ($weighted){
		foreach $link (keys %sureAnswer){
			$totalSureAnswer_weightsSure+=$weightsSure{$link};	
			$totalSureAnswer_weightsPossible+=$weightsPossible{$link};	
		}		
		foreach $link (keys %sureSubmission){
			$totalSureSubmission_weightsSure+=$weightsSure{$link};	
			$totalSureSubmission_weightsPossible+=$weightsPossible{$link};	
		}		
		foreach $link (keys %possibleAnswer){
			$totalPossibleAnswer+=$weightsPossible{$link};	
		}		
		foreach $link (keys %possibleSubmission){
			$totalPossibleSubmission+=$weightsPossible{$link};	
		}		
	}else{ #every link has a weight 1
		$totalSureAnswer_weightsSure=scalar(keys %sureAnswer);
		$totalSureAnswer_weightsPossible= $totalSureAnswer_weightsSure;
		$totalSureSubmission_weightsSure=scalar(keys %sureSubmission);
		$totalSureSubmission_weightsPossible=$totalSureSubmission_weightsSure;
		$totalPossibleAnswer=scalar(keys %possibleAnswer);
		$totalPossibleSubmission=scalar(keys %possibleSubmission);
	}
	
# 4 COUNT MATCHES

#	print "sureSubmission:",join("|",keys %sureSubmission),"\n";
#	print "possibleSubmission:",join("|",keys %possibleSubmission),"\n";
#	print "sureAnswer:",join("|",keys %sureAnswer),"\n";
#	print "possibleAnswer:",join("|",keys %possibleAnswer),"\n";
#   print "\n";
	# now determine the S[ure] matches 
	$sureMatch = 0;
	foreach $alignment (keys %sureSubmission) {
	    if(defined($sureAnswer{$alignment})) {
			if (!$weighted){$sureMatch++}
			else {$sureMatch += $weightsSure{$alignment}}
	    }
	}
	# and the [P]robable matches 
	# these are checked against both S[ure] and P[robable] correct alignments
	$possibleMatch = 0;
	foreach $alignment (keys %possibleSubmission, keys %sureSubmission) {
	    if(defined($sureAnswer{$alignment}) || defined($possibleAnswer{$alignment})) {
			if (!$weighted){$possibleMatch++}
	    	else{$possibleMatch += $weightsPossible{$alignment}}
	    }
	}
	# and also the intersection between all submitted alignments 
	# and the S [Sure] correct alignments -- as needed by AER
	$possibleMatchSure = 0;
	foreach $alignment (keys %possibleSubmission, keys %sureSubmission) {
	    if(defined($sureAnswer{$alignment})) {
	    	if (!$weighted){$possibleMatchSure++}
	    	else{$possibleMatchSure+= $weightsPossible{$alignment}}
	    }
	}
#	print "sureMatch:$sureMatch possibleMatch:$possibleMatch possibleMatchSure:$possibleMatchSure\n";

# 5 COMPUTE EVALUATION MEASURES
	# now determine the precision, recall, and F-measure for [S]ure alignments
	if(scalar(keys %sureSubmission) != 0) {
	    $surePrecision = $sureMatch / $totalSureSubmission_weightsSure;
	}else {
	    $surePrecision = 0;
	}
	if(scalar(keys %sureAnswer) != 0) {
	    $sureRecall = $sureMatch / $totalSureAnswer_weightsSure;
	}else {
	    $sureRecall = 0;
	}
	if($sureRecall != 0 && $surePrecision != 0) {
	    $sureFMeasure = 2 * $sureRecall * $surePrecision / ($sureRecall + $surePrecision);
	}else {
	    $sureFMeasure = 0;
	}


	# and now determine the precision, recall, and F-measure for [P]robable alignments
	if(scalar(keys %sureSubmission) + scalar(keys %possibleSubmission) != 0) {
	    $possiblePrecision = $possibleMatch / ($totalSureSubmission_weightsPossible+$totalPossibleSubmission);
	}else {
	    $possiblePrecision = 0;
	}
	if(scalar(keys %sureAnswer) + scalar(keys %possibleAnswer)!= 0) {
	    $possibleRecall = $possibleMatch / ($totalSureAnswer_weightsPossible+$totalPossibleAnswer);
	}else {
	    $possibleRecall = 0;
	}
	if($possibleRecall != 0 && $possiblePrecision != 0) {
	    $possibleFMeasure = 2 * $possibleRecall * $possiblePrecision / ($possibleRecall + $possiblePrecision);
	}else {
	    $possibleFMeasure = 0;
	}

	# and determine the AER
	if(scalar(keys %sureSubmission) + scalar(keys %possibleSubmission) != 0) {
	    $AER = 1 - ($possibleMatchSure + $possibleMatch) / ($totalSureSubmission_weightsPossible+$totalPossibleSubmission+$totalSureAnswer_weightsPossible);
	}else {
	    $AER = 0;
	}
	return Lingua::AlignmentEval->new($surePrecision,$sureRecall,$sureFMeasure,$possiblePrecision,$possibleRecall,$possibleFMeasure,$AER);
}
###################################################################
###   PROCESSING                                                ###
###################################################################

sub processAlignment{
    my ($alSet,$AlignmentSub,$newLocation,$newFormat,$alignMode)=@_;
    my $newAlSet = $alSet->copy;
    if (ref($AlignmentSub) eq 'ARRAY'){
	if ($AlignmentSub->[0] eq "Lingua::Alignment::eliminateWord"){
	    if (@$AlignmentSub<3){die "Missing parameters for Lingua::Alignment::eliminateWord\n"}
	    else{
		my $side = lc $AlignmentSub->[2];
		if (!$alSet->{location}{$side} || !$newLocation->{$side}){die "Missing $side file for Lingua::Alignment::eliminateWord\n"}
	    }
	}	
    }
    $newAlSet->convert($newLocation,$newFormat,$alignMode,$AlignmentSub);
    return $newAlSet;
}

sub symmetrize {
    my ($alSet,$newLocation,$newFormat,$ENV,$selectSubgroups,$alignMode,$globals)=@_;
    #defaults
    if (!defined($selectSubgroups)){$selectSubgroups=0}	
    if (!defined($alignMode)){$alignMode="no-null-align"}
    if (!defined($globals->{"minPhraseFrequency"})){$globals->{"minPhraseFrequency"}=2};
    if (!defined($globals->{"extendGroups"})){$globals->{"extendGroups"}=0};
    if (!defined($globals->{"onlyGroups"})){$globals->{"onlyGroups"}=1};
    if (!defined($globals->{"defaultActionGrouping"})){$globals->{"defaultActionGrouping"}="Lingua::Alignment::getUnion"};
    if (!defined($globals->{"defaultActionGeneral"})){$globals->{"defaultActionGeneral"}="Lingua::Alignment::intersect"};
    if (!defined($globals->{"verbose"})){$globals->{"verbose"}="0"};
    my $verbose = $globals->{"verbose"};
    my $al; # reference alignment- remains unchanged
    my $modAl; #reference alignment modified with the successive aplication of symRules
    #load in memory a chunk of the alignment set as a list
    #of references to (internal representation) alignment objects:
    my ($k,$alSetChunk);
    my $FH = $alSet->openFiles();
    my $newFH;
    if ($selectSubgroups==0){
	$newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
    }
    my $internalSentPairNum = 1;	
    my ($sentenceNum,$ruleApplied) = ($alSet->{firstSentPair},0);

    my $grSentPairNum=1;

    my ($j,$i);
    my ($lines,$line);
    my $groups = {};
    my $groupsCurrentSentence = {};
    my $groupKeys = [];
    my $subGroups={};
    my $subGroupsCurrentSentence = {};
    my $subGroupKeys=[];
    my ($candidate,$count);

    if (!$selectSubgroups){	#load subgroup hash and array:
	open(GROUPS,"<$ENV/groups");
	while (<GROUPS>){
	    push @$groupKeys,$_;
	    @$line = split " | ",$_,2;
	    $groups->{$line->[1]}=$line->[0];	
	}
	if ($globals->{onlyGroups}==0){
	    open(SUBGROUPS,"<$ENV/subGroups");
	    while (<SUBGROUPS>){
		push @$subGroupKeys,$_;
		@$line = split " | ",$_,2;
		$subGroups->{$line->[1]}=$line->[0];	
	    }
	}
    }
	
    my %anchors;
    my %sourcePerturbed={};
    my %targetPerturbed={};	# Perturbations must be distinct so we keep track of the already detected "Perturbed" $j's
    my ($perturbation,$perturbationNoMod);
    my ($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
    my ($ind,$newPerturbationDetected,$anchorsInTarget);
    my ($countPertubs,$countGrouping,$countOneToMany,$countElse,$countNoGroup)=(0,0,0,0,0);
    while ($alSetChunk = $alSet->loadChunk($FH,$sentenceNum,$alignMode)){	# returns 0 if eof or last sentence pair
	for ($k=0;$k<@$alSetChunk;$k++){
#			print "\nsentence pair $sentenceNum\n";
	    $ruleApplied=0; 
	    $al = $$alSetChunk[$k];
	    if ($verbose >0){
		print $sentenceNum."\n";
		print $al->sourceSentence."\n";
		print $al->targetSentence."\n";
	    }
	    $modAl = $al->clone();
	    ($lastAnchorSource,$lastAnchorTarget)=(0,0);
	    %sourcePerturbed=();
	    %targetPerturbed=();
	    $j = 1;
	    #detect "perturbations" in the anchor diagonal looping only over $j (to have less repeated zones). We can only miss those where $i is aligned only to NULL
	    while ($j<@{$al->{sourceAl}}){
		while ( !$al->isAnchor($j,"source") && $j<(@{$al->{sourceAl}})){
		    $j++;
		}
		if ($j<=@{$al->{sourceAl}}){ 
		    if ($j==@{$al->{sourceAl}}){
			($newAnchorSource,$newAnchorTarget) = ($j,scalar(@{$al->{targetAl}}));
		    }else{
			($newAnchorSource,$newAnchorTarget) = ($j,$al->{sourceAl}[$j][0]);
		    }
		    $newPerturbationDetected=0;
		    if (($newAnchorSource-$lastAnchorSource)!=1 && !$sourcePerturbed{$lastAnchorSource+1}){
			$newPerturbationDetected = 1;	
		    } elsif (($newAnchorTarget-$lastAnchorTarget)!=1 && !$targetPerturbed{$lastAnchorTarget+1}){
			$anchorsInTarget=1;
			for ($i=$lastAnchorTarget+1;$i<$newAnchorTarget;$i++){
			    if (!$al->isAnchor($i,"target")){$anchorsInTarget=0}
			}
			if (!$anchorsInTarget){$newPerturbationDetected=1};
		    }
		    if ( $newPerturbationDetected ){
			$countPertubs++;
#						print "\n($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget)\n";
			
			$perturbation = $al->cut($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
			$perturbationNoMod = $al->cut($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);

			# PRINT PERTURBATION TO FILE
			#if (exists($newFH->{source})){
			#    $newFH->{source}->print("<s snum=$grSentPairNum> ".join(" ",@{$perturbation->{sourceWords}})." </s>\n");
			#}
			#if (exists($newFH->{target})){
			#    $newFH->{target}->print("<s snum=$grSentPairNum> ".join(" ",@{$perturbation->{targetWords}})." </s>\n");
			#}
			#$perturbation->getUnion;
			#$lines = $perturbation->writeToBlinker;
			#foreach $line (@$lines){
			#    $newFH->{sourceToTarget}->print("$grSentPairNum $line\n");
			#}
			#$grSentPairNum++;

			#END PRINT PERTURBATION TO FILE
			if ($selectSubgroups){
			    $perturbation->selectSubgroups($groupsCurrentSentence,$subGroupsCurrentSentence,$globals);
			}else{
			    if (1==0){
#			    if ($ruleApplied=$perturbation->applyOneToMany_2()){	
				$countOneToMany++;
			    }elsif (($ruleApplied=$perturbation->applyGrouping($groupKeys,$subGroupKeys,$globals))>0){
				$countGrouping++;
			    }else{
				my $defaultActionGen = $globals->{defaultActionGeneral};
				$perturbation->$defaultActionGen();
				if ($ruleApplied==-1){
				    $countNoGroup++;
				}else{
				    $perturbation->processNull();
				    $countElse++;	
				}
			    }
			    $perturbation->paste($modAl);
			}
#						print "\ns indices:",join (" ",keys %{$perturbation->{sourceIndices}}),"\n";
#						print "t indices:",join (" ",keys %{$perturbation->{targetIndices}}),"\n";
			foreach $ind (keys %{$perturbation->{sourceIndices}}){
			    if ($ind>0){
				$sourcePerturbed{$ind+$perturbation->{zeroSource}}=1;
			    }
			}					
			foreach $ind (keys %{$perturbation->{targetIndices}}){
			    if ($ind>0){
				$targetPerturbed{$ind+$perturbation->{zeroTarget}}=1;
			    }
			}
#						print "s perturbed:",join (" ",keys %sourcePerturbed),"\n";
#						print "t perturbed:",join (" ",keys %targetPerturbed),"\n";
		    }	#if perturbation 
															
		    $anchors{"$newAnchorSource $newAnchorTarget"}=1;
		    ($lastAnchorSource,$lastAnchorTarget) = ($newAnchorSource,$newAnchorTarget);	
		    $j++;
		}
	    } #while j...
	    if ($newFormat eq "NAACL" && !$selectSubgroups){
		if (exists($newFH->{source})){
		    $newFH->{source}->print("<s snum=$internalSentPairNum> ".join(" ",@{$modAl->{sourceWords}})." </s>\n");
		}
		if (exists($newFH->{target})){
		    $newFH->{target}->print("<s snum=$internalSentPairNum> ".join(" ",@{$modAl->{targetWords}})." </s>\n");
		}
		$al->intersect();
		$lines = $modAl->writeToBlinker;
		foreach $line (@$lines){
		    $newFH->{sourceToTarget}->print("$internalSentPairNum $line\n");
		}
	    }
	    if (($internalSentPairNum % 1000)==0){print STDERR $internalSentPairNum}
	    elsif (($internalSentPairNum % 100)==0){print STDERR "."}
	    $sentenceNum++;
	    $internalSentPairNum++;
	    if ($verbose > 0){print "Candidates:\n";}
	    if ($selectSubgroups){
		foreach $candidate (keys %$groupsCurrentSentence){
		    if ($verbose > 0){print "$candidate\n";}
		    $groups->{$candidate}=$groups->{$candidate}+1;
		}
		%$groupsCurrentSentence=();
		if ($globals->{onlyGroups}==0){
		    foreach $candidate (keys %$subGroupsCurrentSentence){
			$subGroups->{$candidate}=$subGroups->{$candidate}+1;
		    }
		    %$subGroupsCurrentSentence=();
		}
	    }
	}#for k<@alSetChunk
	} #while alsetchunk
    print STDERR "\n";
    if ($selectSubgroups==0){
	closeFiles($newFH,$newFormat);
    }
    closeFiles($FH,$alSet->{format});
    if ($selectSubgroups){
	if ($verbose>0){print "\ngroups:",scalar(keys(%$groups))," - subgroups:",scalar(keys(%$subGroups)),"\n";}
	open(GROUPS, ">$ENV/groups") or die "File opening error:$!";;
	while (($candidate,$count)=each(%$groups)){
#				print "groups $count | $candidate\n";	
	    if ($count >= $globals->{minPhraseFrequency}){
		print GROUPS "$count | $candidate\n";	
	    }
	}
	if ($globals->{onlyGroups}==0){
	    open(SUBGROUPS, ">$ENV/subGroups") or die "File opening error:$!";;
	    while (($candidate,$count)=each(%$subGroups)){
#				print "SUBGROUPS $count | $candidate\n";	
		if ($count >= $globals->{minPhraseFrequency}){
		    print SUBGROUPS "$count | $candidate\n";	
		}
	    }
	}
    }else{
	print STDERR "perturbations:$countPertubs (oneToMany:$countOneToMany grouped:$countGrouping not grouped:$countNoGroup others:$countElse)\n";
	$alSet->{location}=$newLocation;
	$alSet->{format}=$newFormat;
	$alSet->{firstSentPair}=1;
	$alSet->{lastSentPair}="eof";		
    }
}

sub orderAsBilCorpus {
    my ($alSet,$newLocation,$newFormat,$alignMode,$corpSrc,$corpTrg,$verbose)=@_;
    if (!defined($newFormat)){$newFormat="TALP"}
    else {$newFormat = uc $newFormat}
    $newLocation = readLocation($newLocation);
    my $FH = $alSet->openFiles();
    my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
    my $inputSentPairNum=$alSet->{firstSentPair};
    my $internalSentPairNum = 1;	
    if ($verbose >0){
	select STDOUT; $| = 1;         # enable autoflush (desactivate buffering)
    }
    open(CS,"<$corpSrc") || die "$corpSrc file opening error !";
    open(CT,"<$corpTrg") || die "$corpTrg file opening error !";

    # LOAD BILINGUAL CORPUS IN HASH
    my %newcorp;
    my $cntCorp=0;
    while (my $s=<CS>) {
	chomp $s;
	my $t=<CT>;
	chomp $t;
	$newcorp{"$s ||| $t"}=1;
	$cntCorp++;
    }
    if ($verbose > 0){
	print "Number of different sentence pairs in new  corpus:".scalar(keys %newcorp)."\n";
    }
    
    # PARSE AL SET
    my $cntToFind=$cntCorp;
    my %found;
    while (my $alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,"as-is")){	# returns 0 if eof or last sentence pair
	for (my $i=0;$i<@$alSetChunk;$i++){
	    my $al = $$alSetChunk[$i];
	    my $pair=$al->sourceSentence." ||| ".$al->targetSentence;
	    if ($newcorp{$pair}){
		$found{$pair}=$al;
		$cntToFind--;
	    }
	} #for
	if ($cntToFind==0){
	    if ($verbose>0){print STDERR "Leaving loop of giza file at line:$inputSentPairNum\n";}
	    last;
	}
	$inputSentPairNum++;
	if ($verbose>0){
	    if ($inputSentPairNum % 100000 ==0){print $inputSentPairNum;}
	    if ($inputSentPairNum % 10000 ==0){print ".";}
	}
    } #while	
    if ($verbose>0){print "\n";}
    # REORDER AL SET 
    print "reordering Alignment set...\n";
    seek CS,0,0; #go back to beginning of file
    seek CT,0,0;
    while (my $s=<CS>) {
	chomp $s;
	my $t=<CT>;
	chomp $t;
	if (exists($found{"$s ||| $t"})){
	    my $al= $found{"$s ||| $t"};
	    $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
	}else{
	    die "ERROR: not found sentence pair $internalSentPairNum in Alignment Set\n";
	}
	$internalSentPairNum++;
    }
    closeFiles($newFH,$newFormat);
    closeFiles($FH,$alSet->{format});
    $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
}

sub adaptToBilCorpus {
    my ($alSet,$newLocation,$newFormat,$alignMode,$corpSrc,$corpTrg,$restrictions,$verbose)=@_;
    my $pdiff=$restrictions->{allowedPercentWordDiff};
    my $mindiff=$restrictions->{minAllowedNumWordDiff};
    my $maxdiff=$restrictions->{maxAllowedNumWordDiff};
    my $nfirst=$restrictions->{numWordsConsideredFirst};
    my $dumper = new Dumpvalue;

    if (!defined($newFormat)){$newFormat="TALP"}
    else {$newFormat = uc $newFormat}
    $newLocation = readLocation($newLocation);
    my $FH = $alSet->openFiles();
    my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
    my $inputSentPairNum=$alSet->{firstSentPair};
    my $internalSentPairNum = 1;	
    select STDOUT; $| = 1;         # enable autoflush (desactivate buffering)
    open(CS,"<$corpSrc") || die "$corpSrc file opening error !";
    open(CT,"<$corpTrg") || die "$corpTrg file opening error !";

    # DETECT ALSET SENTENCES THAT ARE IN THE CORPUS
    my %newcorp;
    while (my $s=<CS>) {
	chomp $s;
	my $t=<CT>;
	chomp $t;
	$newcorp{"$s ||| $t"}=1;
    }
    if ($verbose > 0){
	print "Number of different sentence pairs in new  corpus:".scalar(keys %newcorp)."\n";
    }
    my $count=0;
    while (my $alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){
        # print $inputSentPairNum."\n";
	for (my $i=0;$i<@$alSetChunk;$i++){
	    my $al = $$alSetChunk[$i];
	    my $s = $al->sourceSentence;
	    my $t = $al->targetSentence;
	    my @ws=split / /,$s;
	    my @wt=split / /,$t;
	    my ($nums,$numt)=(scalar(@ws),scalar(@wt));
	    if (!$newcorp{"$s ||| $t"}){
		# DETECT CLOSEST SENTENCES IN NEW CORPUS AND MODIFY ALIGNMENT SET
		# calculate values for length test (to, later, skip lcs calculation)
		my $sAllowedDiff = Lingua::AlSetLib::max(Lingua::AlSetLib::min($nums*$pdiff/100,$maxdiff),$mindiff);
		my $tAllowedDiff = Lingua::AlSetLib::max(Lingua::AlSetLib::min($numt*$pdiff/100,$maxdiff),$mindiff);
		my $sMin=$nums-$sAllowedDiff;
		my $sMax=$nums+$sAllowedDiff;
		my $tMin=$numt-$tAllowedDiff;
		my $tMax=$numt+$tAllowedDiff;
		if ($verbose > 0){
		    print $s."\n";
		    print $t."\n";
		    print "ns:$nums nt:$numt allowed length diff: s:$sAllowedDiff t:$tAllowedDiff;\n\n";
		}
		my $bestLcs = 0;
		my @bestSrc;
		my @bestTrg;
		my ($cntGoodLength,$cntPassedFirstLCS,$cnt2)=(0,0,0);
		# parse new corpus
		foreach my $pair (keys %newcorp){
		    my ($cs,$ct)=split / \|\|\| /,$pair;
		    my @wcs = split / /,$cs;
		    my @wct = split / /,$ct;
		    my ($numcs,$numct)=(scalar(@wcs),scalar(@wct));

		    if ($verbose >2){print "ncs:$numcs nct:$numct\n";}
		    if ($verbose >2){print "CORPUS:\n$cs\n$ct\n";}		  
		    #length test:
		    if ($numcs<$sMin || $numcs>$sMax || $numct<$tMin || $numct>$tMax){
			# skip lcs ratio calculation
		    }else{
			$cntGoodLength++;
			#calculate LCS, but first looking at the nfirst first words
			my @fws;
			my @fwt;
			my @fwcs;
			my @fwct;
			for (my $i=0;$i<$nfirst;$i++){
			    push @fws,$ws[$i];
			    push @fwt,$wt[$i];
			    push @fwcs,$wcs[$i];
			    push @fwct,$wct[$i];
			}
			my $sLcs = Lingua::AlSetLib::LCS_ratio(\@fws,\@fwcs);
			my $tLcs = Lingua::AlSetLib::LCS_ratio(\@fwt,\@fwct);
			# first words LCS test:
			if ($sLcs == 0 || $tLcs == 0){
			    # skip lcs ratio calculation
			}else{
			    $cntPassedFirstLCS++;
			    my $sLcs = Lingua::AlSetLib::LCS_ratio(\@ws,\@wcs);
			    my $tLcs = Lingua::AlSetLib::LCS_ratio(\@wt,\@wct);
			    if ($verbose >1){
				print "ncs:$numcs nct:$numct\n";
				print "CORPUS:\n$cs\n$ct\n";
				print "chars src lcsr: $sLcs\t trg lcsr:$tLcs\n";
			    }  
			    my $lcs = $sLcs+$tLcs;
			    if ($lcs > $bestLcs){
				$bestLcs = $lcs;
				@bestSrc = ($cs);
				@bestTrg = ($ct);
			    }elsif($lcs == $bestLcs){
				push @bestSrc,$cs;
				push @bestTrg,$ct;
			    }
			}
		    } # if length test
		    if ($verbose >1){
			if ($cnt2>0 && ($cnt2 % 100000)==0){print "$cnt2";}
			elsif ($cnt2>0 && ($cnt2 % 1000)==0){print ".";}
		    }
		    $cnt2++;
		} # for each sent pair in corpus
		my $uniqBestSrc;
		my $uniqBestTrg;
		my $cntCharLevel=scalar(@bestSrc);
		if ($cntCharLevel>1){ 
		    my $bestLcs = 0;
		    # calculate LCS at character level
		    for (my $i=0;$i<$cntCharLevel;$i++){
			my $cs=$bestSrc[$i];
			my $ct=$bestTrg[$i];
			$cntCharLevel++;
			my @chars = split //,$s;
			my @charcs = split //,$cs;
			my $sLcs = Lingua::AlSetLib::LCS_ratio(\@chars,\@charcs);
			my @chart = split //,$t;
			my @charct = split //,$ct;
			my $tLcs = Lingua::AlSetLib::LCS_ratio(\@chart,\@charct);
			my $lcs = $sLcs+$tLcs;
			if ($lcs > $bestLcs){
			    $bestLcs = $lcs;
			    $uniqBestSrc = $cs;
			    $uniqBestTrg = $ct;
			}
		    }
		}else{
		    $uniqBestSrc = $bestSrc[0];
		    $uniqBestTrg = $bestTrg[0];
		}
		if ($cntGoodLength == 0){
		    print "WARNING: sentence pair $inputSentPairNum not found in corpus\n";
		}else{
		    if ($verbose >0){
			my $numCorp=scalar(keys %newcorp);
			print "Passed length test: $cntGoodLength / $numCorp\n";
			print "Passed first words LCS test: $cntPassedFirstLCS\n";
			print "LCS calculated at character level: "; if ($cntCharLevel==1){print "0"}else{print $cntCharLevel}; print "\n";
			print "\nbest lcsr: $bestLcs, best pair:\n$uniqBestSrc\n$uniqBestTrg\n";
		    }
		    # detect edits to pass from alset sent pair to corpus sent pair
		    my @bs=split / /,$uniqBestSrc;
		    my @diffs = Lingua::AlSetLib::diff( \@ws, \@bs );
		    if ($verbose>2){print $dumper->dumpValue(\@diffs);}

		    # parse output of diff function
		    my @updatedPosi; #array: orig posis -> updated posis
		    my %reversePosi; #hash: updated posis -> orig posis
		    for (my $i=0;$i<=$nums;$i++){
			$updatedPosi[$i]=$i;
			$reversePosi{$i}=$i;
		    }

		    foreach my $hunk (@diffs){
			my @delPosi;
			my @del;
			my @addPosi;
			my @add;
			foreach my $change (@$hunk) {
			    if ($change->[0] eq '-'){
				push @delPosi,$change->[1]+1;
				push @del,$change->[2];
			    }else{
				push @addPosi,$change->[1]+1;
				push @add,$change->[2];
			    }
			}
			# del posis are relative to first array (@ws) => update posis
			# add posis are relative to second array (@bs) => don't update posis
			my $numDel=scalar(@delPosi);
			my $numAdd=scalar(@addPosi);
			if ($numDel==0){ #insertion
			    $al->splice("source",$addPosi[0],0,\@add);
			    print "insert '",join(" ",@add),"' at position { ",$addPosi[0]," }\n";
			    #update updatedPosi array
			    for (my $i=$reversePosi{"$addPosi[0]"};$i<=$nums;$i++){
				$updatedPosi[$i]+=$numAdd;
				$reversePosi{"$updatedPosi[$i]"}=$i;
			    }
			}else{ # substitution or deletion
			    $al->splice("source",$updatedPosi[$delPosi[0]],$numDel,\@add);
			    print "substitute '",join(" ",@del),"' at positions { ",join(" ",@delPosi)," } by '",join(" ",@add),"'\n";
			    #update updatedPosi array
			    for (my $i=$delPosi[0]+$numDel;$i<=$nums;$i++){
				$updatedPosi[$i]+=$numAdd-$numDel;
				$reversePosi{"$updatedPosi[$i]"}=$i;
			    }
			}
		    }
		    #target
		    my @bt=split / /,$uniqBestTrg;
		    my @diffs = Lingua::AlSetLib::diff( \@wt, \@bt );
		    if ($verbose>2){print $dumper->dumpValue(\@diffs);}
		    # parse output of diff function
		    @updatedPosi=();
		    %reversePosi=();
		    for (my $i=0;$i<=$numt;$i++){
			$updatedPosi[$i]=$i;
			$reversePosi{$i}=$i;
		    }
		    foreach my $hunk (@diffs){
			my @delPosi;
			my @del;
			my @add;
			my @addPosi;
			foreach my $change (@$hunk) {
			    if ($change->[0] eq '-'){
				push @delPosi,$change->[1]+1;
				push @del,$change->[2];
			    }else{
				push @addPosi,$change->[1]+1;
				push @add,$change->[2];
			    }
			}
			#update updatedPosi array
			my $numDel=scalar(@delPosi);
			my $numAdd=scalar(@addPosi);
			if ($numDel==0){ #insertion
			    $al->splice("target",$addPosi[0],0,\@add);
			    print "insert '",join(" ",@add),"' at position { ",$addPosi[0]," }\n";
			    #update updatedPosi array
			    for (my $i=$reversePosi{"$addPosi[0]"};$i<=$numt;$i++){
				$updatedPosi[$i]+=$numAdd;
				$reversePosi{"$updatedPosi[$i]"}=$i;
			    }
			}else{ # substitution or deletion
			    $al->splice("target",$updatedPosi[$delPosi[0]],$numDel,\@add);
			    print "substitute '",join(" ",@del),"' at positions { ",join(" ",@delPosi)," } by '",join(" ",@add),"'\n";
			    #update updatedPosi array
			    for (my $i=$delPosi[0]+$numDel;$i<=$numt;$i++){
				$updatedPosi[$i]+=$numAdd-$numDel;
				$reversePosi{"$updatedPosi[$i]"}=$i;
			    }
			}
		    }
		    if ($verbose>0){print "--------------------------------------------------------------\n";}
		}
	    }else{
		# sentence pair is in corpus: don't modify anything
	    }
	    $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
	    $count++;
	}
	$inputSentPairNum++;
	$internalSentPairNum++;
    } #while
    print "$count sentence pairs parsed in alignment set\n";
    closeFiles($newFH,$newFormat);
    closeFiles($FH,$alSet->{format});
    $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
}

######################################################################
###	PRIVATE SUBS
######################################################################

sub readLocation{
	my $location = shift;
	
	if (!ref($location)){ #if it is a path, put it in a location hash
		$location = {"sourceToTarget"=>$location}
	}
	return $location;	
}

sub setRange {
    my ($alSet,$range) = @_;
	
    my @limits = split /-/, $range;
    my $numLimits = scalar(@limits);
    if ($numLimits == 0 || $numLimits >2){
	die "Invalid Range:$range\n";
    }elsif ($numLimits == 1){
	$limits[1]="";	
    }
    $limits[0] =~ s/^\s+|\s+$//g;
    $limits[1] =~ s/^\s+|\s+$//g;
    if ($limits[0] !~ /\d+/ || $limits[0] == 0){
	$alSet->{firstSentPair}="1";
    }else{
	$alSet->{firstSentPair}=$limits[0];
    }
    if ($limits[1] !~ /\d+/ || $limits[1] == 0){
	$alSet->{lastSentPair}="eof";
    }else{
	$alSet->{lastSentPair}=$limits[1];
    }
}

#for future ease we save detailed infos contained in the source sample path
#input: sourceToTarget dir (not optional), targetToSource dir (if exists), source path (optional) and target path (if necessary)
#output: target (if not specified in input), sampleNum (sample number)
sub completeBlinkerLocation{
	my $refToLocation = shift;
	my ($sourceLang,$targetLang);

	if ($refToLocation->{source}){
		my ($sourceDir,$sourceFileName)=split /\/([^\/]+)$/,$refToLocation->{source};
		if ($sourceFileName =~ /^(EN|FR)\.sample.\d+$/){
			#extract the sample number and target file:
			my ($sourceLang,$nothing,$sampleNum) = split /\./,$sourceFileName;
			$refToLocation->{sampleNum} = $sampleNum;
		}
	}
	if (!$refToLocation->{sampleNum}){
		$refToLocation->{sampleNum} = 1;
	}
}

# open (for read or write) the files contained in a "location" hash (ex. at the {location} key of the alignment set hash)
# if opens for write needs old location hash to check you won't delete the old format files
# returns a ref to a hash containing the filehandle variables (hash with same keys as "location" except for Blinker format)
sub openLocation {
    my ($location,$format,$openMode,$oldLocation) = @_;		#oldLocation: optional parameter
    my %FH;

    if ($openMode eq ">"){
	if ($format eq "BLINKER"){
	    completeBlinkerLocation($location);
	}
	# check that your new files are different to prevent from deleting the old ones
	my %oldFiles = reverse %$oldLocation;
	my ($key,$newFile);
	while (($key, $newFile)=each %$location){
	    if ($oldFiles{$newFile} && $key ne "sampleNum"){
		die "Convert function: you are opening for write one of the old format file: $newFile\n";	
	    }
	}
	#end of check
	
	# create directory structure where to create the file/directory if it doesn't exist, create it 
	my $type;
	if ($format eq "BLINKER"){$type = "dir"}
	else {$type = "file"}
	createDirStructure($location->{sourceToTarget},$type);
	if ($location->{targetToSource}){
	    createDirStructure($location->{targetToSource},$type);
	}
	if ($location->{source}){
	    createDirStructure($location->{source},"file");
	}
	if ($location->{target}){
	    createDirStructure($location->{target},"file");
	}
	#end create directory structure
    }
	
    if ($format eq "GIZA"){
	$FH{sourceToTarget} = IO::File->new($openMode.$location->{sourceToTarget}) or die "GIZA file (".$location->{sourceToTarget}.") opening error:$!";
	if ($location->{targetToSource}){
	    $FH{targetToSource} = IO::File->new($openMode.$location->{targetToSource}) or die "GIZA file (".$location->{targetToSource}.") opening error:$!"; 
	}
    } elsif ($format eq "NAACL" || $format eq "TALP"){
	if ($location->{source}){
	    $FH{source} = IO::File->new($openMode.$location->{source}) or die "Source file (".$location->{source}.")  opening error:$!";
	}
	if ($location->{target}){
	    $FH{target} = IO::File->new($openMode.$location->{target}) or die "Target file (".$location->{target}.")  opening error:$!";
	}
	$FH{sourceToTarget} = IO::File->new($openMode.$location->{sourceToTarget}) or die "Alignment file (".$location->{sourceToTarget}.") opening error:$!";				
	if ($location->{targetToSource}){
	    $FH{targetToSource} = IO::File->new($openMode.$location->{targetToSource}) or  die "Alignment file (".$location->{targetToSource}.") opening error:$!";
	}
    } elsif ($format eq "BLINKER"){
	if ($location->{source}){
	    $FH{source} = IO::File->new($openMode.$location->{source}) or die "BLINKER source file (".$location->{source}.") opening error:$!";
	}
	if ($location->{target}){
	    $FH{target} = IO::File->new($openMode.$location->{target}) or die "BLINKER source file (".$location->{target}.") opening error:$!";
	}
    }
    return (\%FH);
}


# if you want to create a file of path "directory_structure/file", makes "directory_structure" if necessary.
# if you want to create a directory of path "directory_structure", makes it if it doesn't exist
# type is "dir" (if you want to create a directory) or "file" (a file) 
sub createDirStructure {
    my ($path,$type)=@_;
    
    if ($type eq "dir"){
	unless(-e $path && -d _){
	    system('mkdir -p '.$path); 
	}	
    }elsif ($type eq "file"){
	$path =~ s/\/$//;
	if ($path =~ /\//){
	    my ($dir,$file)=split /\/[^\/]+$/,$path;
	    unless (-e $dir){
		system('mkdir -p '.$dir);	
	    }
	}
    }
}

# open files of an alignment set for read and go to first sentence pair
sub openFiles {
    my $alSet = shift;
    my %FH = %{openLocation($alSet->{location},$alSet->{format},"<")};
    my $fhPos;
    my $lineNb;

    # go to first Sentence pair:
    if ($alSet->{format} eq "TALP"){
	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ #go to first Sentence pair
	    if ($FH{source}){
		$FH{source}->getline();	
	    }
	    if ($FH{target}){
		$FH{target}->getline();	
	    }
	    if ($FH{sourceToTarget}){
		$FH{sourceToTarget}->getline();	
	    }
	    if ($FH{targetToSource}){
		$FH{targetToSource}->getline();	
	    }
	}
    }elsif ($alSet->{format} eq "GIZA"){
	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ #go to first Sentence pair
	    $FH{sourceToTarget}->getline();	
	    $FH{sourceToTarget}->getline();	
	    $FH{sourceToTarget}->getline();	
	    if ($FH{targetToSource}){
		$FH{targetToSource}->getline();	
		$FH{targetToSource}->getline();	
		$FH{targetToSource}->getline();	
	    }
	}
    } elsif ($alSet->{format} eq "NAACL"){
	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ 
	    if ($FH{source}){
		$FH{source}->getline();	
	    }
	    if ($FH{target}){
		$FH{target}->getline();	
	    }
	    $fhPos = $FH{sourceToTarget}->getpos;
	    while ($FH{sourceToTarget}->getline() !~ m/^0*$alSet->{firstSentPair} .*/ && !$FH{sourceToTarget}->eof()) {
		$fhPos = $FH{sourceToTarget}->getpos;
	    }
	    if ($FH{sourceToTarget}->eof()){
		die "First sentence pair of range (number ".$alSet->{firstSentPair}.") not found in ".$alSet->{location}{sourceToTarget}; 	
	    }
	    $FH{sourceToTarget}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
	    if ($FH{targetToSource}){
		$fhPos = $FH{targetToSource}->getpos;
		while ($FH{targetToSource}->getline() !~ m/^0*$alSet->{firstSentPair} .*/ && !$FH{targetToSource}->eof()) {
		    $fhPos = $FH{targetToSource}->getpos;
		}
		if ($FH{targetToSource}->eof()){
		    die "First sentence pair of range (number ".$alSet->{firstSentPair}.") not found in ".$alSet->{location}{targetToSource}; 	
		}
		$FH{targetToSource}->setpos($fhPos);
	    }
	}
    } elsif ($alSet->{format} eq "BLINKER"){
	if ($FH{source}){
	    for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ 
		$FH{source}->getline();	
	    }
	}
	if ($FH{target}){
	    for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ 
		$FH{target}->getline();	
	    }
	}
    }
    return (\%FH);
}

# close the files contained in the hash at the {location} key of the alignment set hash
sub closeFiles {
    my ($FH,$format) = @_;
    
    if ($format eq "GIZA"){
	$FH->{sourceToTarget}->close();
	if ($$FH{targetToSource}){
	    $FH->{targetToSource}->close();
	}
    } elsif ($format eq "NAACL" || $format eq "TALP"){
	if ($FH->{source}){
	    $FH->{source}->close();
	}
	if ($FH->{target}){
	    $FH->{target}->close();
	}
	if ($FH->{sourceToTarget}){
	    $FH->{sourceToTarget}->close();
	}
	if ($FH->{targetToSource}){
	    $FH->{targetToSource}->close();
	}
    } elsif ($format eq "BLINKER"){
	if ($FH->{source}){
	    $FH->{source}->close();
	}
	if ($FH->{target}){
	    $FH->{target}->close();		
	}
    }
}

# convert a chunk of alignment set file to an array of references to simple (1 sentence) alignment objects
# returns 0 if the file is at eof
sub loadChunk {
    my ($alSet,$alFH,$sentPairNum,$alignMode) = @_;
    my ($sourceString,$targetString,$alString,$reverseAlString);
    my $st_alignments=[];
    my $ts_alignments=[];
    my $al;
    my $theEnd;
    
    if (!defined($alignMode) || $alignMode =~ /^as.?is$/i){
	$alignMode = "as-is";		
    }elsif ($alignMode =~ /^null.?align$/i){
	$alignMode = "null-align";
    }elsif ($alignMode =~ /^no.?null.?align$/i){
	$alignMode = "no-null-align";
    }else{
	die 'Incorrect alignment mode. Correct modes are "as-is","null-align" or "no-null-align".'."\n";
    }
    if ($alSet->{format} eq "TALP"){
	if ($alSet->{lastSentPair} eq "eof"){
	    $theEnd = $$alFH{sourceToTarget}->eof();
	}else{
	    $theEnd = ($$alFH{sourceToTarget}->eof() || $sentPairNum > $alSet->{lastSentPair});
	}
	if ($theEnd){
	    return 0;	
	}else{
	    if ($alFH->{source}){	
		$sourceString = $alFH->{source}->getline();
	    }
	    if ($alFH->{target}){	
		$targetString = $alFH->{target}->getline();
	    }
	    if ($alFH->{sourceToTarget}){
		$alString = $alFH->{sourceToTarget}->getline();
	    }
	    if ($alFH->{targetToSource}){
		$reverseAlString = $alFH->{targetToSource}->getline();
	    }
	    $al = Lingua::Alignment->new;
	    $al->loadFromTalp($alString,$reverseAlString,$sourceString,$targetString);
	}
    }elsif ($alSet->{format} eq "GIZA"){
	if ($alSet->{lastSentPair} eq "eof"){
	    $theEnd = $$alFH{sourceToTarget}->eof();
	}else{
	    $theEnd = ($$alFH{sourceToTarget}->eof() || $sentPairNum > $alSet->{lastSentPair});
	}
	if ($theEnd){
	    return 0;	
	}else{
	    $$alFH{sourceToTarget}->getline();
	    $targetString = $$alFH{sourceToTarget}->getline();
	    $alString = $$alFH{sourceToTarget}->getline();
	    if ($$alFH{targetToSource}){
		$$alFH{targetToSource}->getline();
		$$alFH{targetToSource}->getline();
		$reverseAlString  = $$alFH{targetToSource}->getline();
	    }
	    $al = Lingua::Alignment->new;
	    $al->loadFromGiza($alString,$targetString,$reverseAlString);		
	}
    } elsif ($alSet->{format} eq "NAACL"){
	my $fhPos;
	if ($alSet->{lastSentPair} eq "eof"){
	    $theEnd = $$alFH{sourceToTarget}->eof();
	}else{
	    $theEnd = ($$alFH{sourceToTarget}->eof() || $sentPairNum > $alSet->{lastSentPair});
	}
	if ($theEnd){
	    return 0;	
	}else{
	    if ($$alFH{source}){
		$sourceString = $$alFH{source}->getline();
		#strip tags and memorize snum:
		$sourceString =~ s/<s snum=\d+>(.*)<\/s>/$1/;
	    }
	    if ($$alFH{target}){
		$targetString = $$alFH{target}->getline();
		#strip tags and memorize snum:
		$targetString =~ s/<s snum=(\d+)>(.*)<\/s>/$2/;
	    }
	    $fhPos = $$alFH{sourceToTarget}->getpos;
	    $alString = $$alFH{sourceToTarget}->getline();
	    my ($num,$theRest)=split " ",$alString,2;
	    if ($num==$sentPairNum){	#skip if there is no link for this sentence pair
		$fhPos = $$alFH{sourceToTarget}->getpos;
		push @$st_alignments,$theRest;
		while ($$alFH{sourceToTarget}->getline() =~ m/^$sentPairNum (.*)$/) {
		    push @$st_alignments,$1;
		    $fhPos = $$alFH{sourceToTarget}->getpos;
		}
	    }
	    $$alFH{sourceToTarget}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
	    if ($$alFH{targetToSource}){
		$fhPos = $$alFH{targetToSource}->getpos; 
		$alString = $$alFH{targetToSource}->getline();
		my ($num,$theRest)=split " ",$alString,2;
		if ($num==$sentPairNum){	#skip if there is no link for this sentence pair
		    $fhPos = $$alFH{targetToSource}->getpos; 
		    push @$ts_alignments,$theRest;
		    while ($$alFH{targetToSource}->getline() =~ m/^$sentPairNum (.*)$/) {
			push @$ts_alignments,$1;
			$fhPos = $$alFH{targetToSource}->getpos; 
		    }
		}
		$$alFH{targetToSource}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
	    }
	    $al = Lingua::Alignment->new;
	    $al->loadFromBlinker($st_alignments,$ts_alignments,$sourceString,$targetString);
	}		
    } elsif ($alSet->{format} eq "BLINKER"){		
	if ($alSet->{lastSentPair} eq "eof"){
	    $theEnd = !(-e $alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
	}else{
	    $theEnd = !(-e $alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1)) || $sentPairNum > $alSet->{lastSentPair};
	}
	if ($theEnd){	
	    return 0;	
	}else{
	    if ($alFH->{source}){	
		$sourceString = $alFH->{source}->getline();
	    }
	    if ($alFH->{target}){	
		$targetString = $alFH->{target}->getline();
	    }
	    open(AL,"< ".$alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
	    @$st_alignments = <AL>;			
	    close(AL);
	    if ($alSet->{location}->{targetToSource}){
		open(AL,"< ".$alSet->{location}->{targetToSource}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
		@$ts_alignments = <AL>;			
		close(AL);
	    }
	    $al = Lingua::Alignment->new;
	    $al->loadFromBlinker($st_alignments,$ts_alignments,$sourceString,$targetString);		
	}
    } 
    if ($alignMode eq "null-align"){
	$al->forceNullAlign();
    }elsif ($alignMode eq "no-null-align"){
	$al->forceNoNullAlign();
    }
    return [$al];
}


sub updateObject {
    my ($alSet,$newFormat,$newLocation,$lastSentPairNum)=@_;
    $alSet->{location}->{sourceToTarget}=$newLocation->{sourceToTarget};	
    $alSet->{location}->{targetToSource}=$newLocation->{targetToSource};
    if ($newLocation->{source}){
	$alSet->{location}->{source}=$newLocation->{source};
    }else{
	if ($alSet->{firstSentPair} != 1 || $alSet->{format} ne $newFormat){ 
	    # in this case the numeration of the converted alignment file and that of the (not converted) source file will not correspond
	    delete($alSet->{location}->{source});
#			warn "After converting into ",$newLocation->{sourceToTarget},", the numeration of the source words file",
#			" didn't correspond any more to that of the alignment file. So the 'source' entry has been removed from the location hash.";
	}
    }
    if ($newLocation->{target}){
	$alSet->{location}->{target}=$newLocation->{target};
    }else{
	if ($alSet->{firstSentPair} != 1 || $alSet->{format} ne $newFormat){
	    # in this case the numeration of the converted alignment file and that of the (not converted) source file will not correspond
	    delete($alSet->{location}->{target});	
#			warn "After converting into ",$newLocation->{sourceToTarget},", the numeration of the target words file ",
#			"didn't correspond any more to that of the alignment file. So the 'target' entry has been removed from the location hash.";
	}
    }
    $alSet->{format}=$newFormat;
    if ($newFormat eq "BLINKER"){
	$alSet->{location}->{sampleNum}=$newLocation->{sampleNum};	
    }elsif(exists($alSet->{location}->{sampleNum})){
	delete($alSet->{location}->{sampleNum});
    }
    $alSet->{firstSentPair}=1;
    $alSet->{lastSentPair}=$lastSentPairNum;
}

# returns the alignment set, with a unique new file set that has the required location,format and range values.
# TO DO: conversion to Giza++ format
sub convert {
    my ($alSet,$newLocation,$newFormat,$alignMode,$AlignmentSub)=@_;
    if (!defined($newFormat)){$newFormat="TALP"}
    else {$newFormat = uc $newFormat}
    $newLocation = readLocation($newLocation);
    my $FH = $alSet->openFiles();
    my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
    my ($i,$al,$alSetChunk,$line,$lines);
    my $inputSentPairNum=$alSet->{firstSentPair};
    my $internalSentPairNum = 1;	
    while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){	# returns 0 if eof or last sentence pair
#	    print $inputSentPairNum."\n";
	for ($i=0;$i<@$alSetChunk;$i++){
	    $al = $$alSetChunk[$i];
	    if (defined($AlignmentSub)){
		#look if $AlignmentSub is a ref to an Array or a subroutine
		if (ref($AlignmentSub) eq "ARRAY"){
		    my ($sub,@params) = @$AlignmentSub;
		    $al->$sub(@params);	
		}else{
		    $al->$AlignmentSub();	
		}
	    }
	    $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
	} #for
	$inputSentPairNum++;
	$internalSentPairNum++;
    } #while	
    closeFiles($newFH,$newFormat);
    closeFiles($FH,$alSet->{format});
    $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
}


# returns the alignment set, with only manyToMany links (source and target words file don't change)
# at the moment, only works for sourceToTarget alignment (sourceAl)
sub printManyToMany {
    my ($alSet,$newLocation,$newFormat)=@_;
    if (!defined($newFormat)){$newFormat="TALP"}
    else {$newFormat = uc $newFormat}
    $newLocation = readLocation($newLocation);
    my $dumper=new Dumpvalue;
    my $FH = $alSet->openFiles();
    my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
    my ($i,$al,$alSetChunk,$line,$lines);
    my $inputSentPairNum=$alSet->{firstSentPair};
    my $internalSentPairNum = 1;	
    
    while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,"no-null-align")){	# returns 0 if eof or last sentence pair
	for ($i=0;$i<@$alSetChunk;$i++){
	    $al = $$alSetChunk[$i];
	    my $clone = $al->clone;
	    # look for manyToMany links
	    my $clusters = $al->getAlClusters;
	    $al->{sourceAl}=[];
#	    print $dumper->dumpValue($clusters);
	    for (my $c=0;$c<@$clusters;$c++){
		if ( @{$clusters->[$c]{source}}>1 || @{$clusters->[$c]{target}}>1 ){
		    # this is a many to many alignment
		    foreach my $j (@{$clusters->[$c]{source}}){
			foreach my $k (@{$clusters->[$c]{target}}){
			    if ($clone->isIn("sourceAl",$j,$k)){
				push @{$al->{sourceAl}[$j]},$k;
			    }
			}
		    }
		}
	    }
	    $al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
	} #for
	$inputSentPairNum++;
	$internalSentPairNum++;
    } #while	
    closeFiles($newFH,$newFormat);
    closeFiles($FH,$alSet->{format});
    $alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
}


# identifies a link as sure or possible
# input: a Naacl-file line containing the link and refs to sure and possible hashes
# action: add to the relevant hash a key corresponding to this link
sub identifySurePossible{
	my ($line,$sure,$possible)=@_;
	my @components;
	my $alignment;
	
#	print "line:$line\n";
	
	#code adapted from Rada Mihalcea's wa_eval_align.pl, rada@cs.unt.edu
    # get all line components: format should be
    # sentence_no position_L1 position_L2 [S|P] [confidence]
    @components = split /\s+/, $line;
    
    if(scalar(@components) < 3) {
		print STDERR "Incorrect format in answer file\n";
	exit;
    }
    
    $alignment = $components[0]." ".$components[1]." ".$components[2];
    
    # identify the S[ure] alignments
    if( scalar (@components) == 3 || (scalar (@components) == 4 && ($components[3] =~ /^[\d\.]+$/ || $components[3] eq 'S')) ||
	(scalar (@components) == 5 && ($components[3] eq 'S' || $components[4] eq 'S'))) {
		$sure->{$alignment} = 1; 
    }
    
    # identify the P[robable] alignments
    if( (scalar (@components) == 4 && $components[3] eq 'P') || (scalar (@components) == 5 && 
    ($components[3] eq 'P' || $components[4] eq 'P'))) {
		$possible->{$alignment} = 1;
    }
}

1;
__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

Lingua::AlignmentSet - Tools library to manage an Alignment Sets, i.e. a set of sentences aligned at the word (or phrase) level.

=head1 SYNOPSIS

  use Lingua::AlignmentSet;

  See the synopsis of  method calls in doc/reference.pdf

=head1 ABSTRACT

    This module is a Tools Library to manage an Alignment Set, i.e. a set of sentences aligned at the word (or phrase) level. It provides methods to display the links, to apply a function to each alignment of the set, to evaluate the alignments against a reference, and more. One of the objectives of the module is to allow the user to perform all these operations without bothering with the particular physical format of the Alignment Set. Anyway it also provides format conversion methods.

=head1 DESCRIPTION

See doc/reference.pdf for a description.

=head1 SEE ALSO

The reference file (doc/reference.pdf)

=head1 AUTHOR

Patrick Lambert, E<lt>lambert@lsi.upc.esE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2004 by Patrick Lambert

This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)