The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#--------------------------------------------------------------------------
# -*-perl-*-
#
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# $Id$
#--------------------------------------------------------------------------
# USAGE: optimize [OPTIONS] bitext.ces align.gold [src-lang [trg-lang]]
#
# OPTIONS 
#
# -tb bitext.train.... bitext for training (xces align format)
# -tg train.gold ..... gold standard for training
# -eb bitext.test .... bitext for evaluation (default=same as training)
# -eg test.gold ...... gold standard for evaluation (default=tg)
# -hb bitext.heldout . held-out data for evaluation
# -hg heldout.gold ... gold standard for heldout data
# -l nr .............. learn dynamic clues after each <nr> alignments
# -a ................. always learn dynamic clues (even if improvements
#                      encountered during the previous generation)
# -m nr .............. keep at most <nr> individuals for producing new
#                      offspring
# -r nr .............. start (run) at most <nr> amounts of new 
#                      alignment processes (children)
#
#
#   bitext.ces ..... sentence alignment file a la XCES (used for estimating
#                    clues and asa default bitext if no other ones specified)
#   align.gold ..... default gold standard (for training and evaluation if
#                    no other ones specified)
#   src-lang ....... source language id, e.g. 'sv' (optional)
#   trg-lang ....... source language id, e.g. 'en' (optional)
#
# * heldout data should be distinct from all other data (bitext.ces, 
#   bitext.train and bitext.test)
#
# * train.gold and test.gold should be disjoint (but may be included in the
#   same data set from which the clues are derived (bitext.ces))
#
# * bitext.train and bitext.test should only include sentence pairs for which
#   links are defined in the according gold standards (to save processing time)
#
# * word alignment and evaluation (fitness) will be run for training data
#   alignment and evaluation for testing and heldout data is only done if
#   they are specified (how obvious ...)
#
# * probability of mutation operations is hard-coded !!!!! (see Mutate sub)
#     * add random clue = 0.5
#     * increase clue weight of randomly chosen clue = 0.167
#     * decrease clue weight of randomly chosen clue = 0.167
#     * increase overall score threshold = 0.067
#     * decrease overall score threshold = 0.067
#     * remove random clue = 0.067
# * evaluation results will be stored in a separate sub-directory EVAL 
#   which will be created 
#   (if non-existsing) in the current directory
# * evaluations of the held-out data will be stored in a separate 
#   HELDOUT subdirectory
#   (which will also be created if it does not exist already)
#
#--------------------------------------------------------------------------
# run lots of remote alignments
# config-files: uplug/systems/align/word/test/
#--------------------------------------------------------------------------
# this script
#   * creates word alignment clues
#         - basic clues
#         - giza clues                             (requires GIZA++)
#         - dynamic clues from time to time
#           (each time more than $LearnStep new 
#            scores have been counted)
#   * runs the clue aligner remotely               (requires remote-scripts)
#     (max $RunMax processes at a time)
#   * evaluates alignments using a gold standard
#   * collects scores
#   * modifies clue settings
#         - deletes clues          ($StartWithAllClues=1)
#         - adds clues             ($StartWithAllClues=0)
#         - changes clue weights   (if no other modification is possible or
#                                   $ModifyWeightsOnly=1)
#   * only the 10 best settings are kept and modified 
#     (can be changed: look at $Keepmax)
#
#--------------------------------------------------------------------------


use FindBin qw($Bin);
use strict;
use Cwd;
use File::Basename;

my $StartWithAllClues=0;  # =1 --> start with all available clues
my $ModifyWeightsOnly=0;  # =1 --> modify weights only (don't add/delete clues)
my $RunMax=10;            # max number of remote processes
my $KeepMax=10;           # keep the best KeepMax settings before modfiying
my $ClueWeightSteps=0.01; # steps for changing clue weights
my $LearnStep=100;        # how often to learn new dynamic clues
my $maxNotImpr=25;        # stop after a certain number of clue-learning
                          #     without any improvement
my $DefWeight=0.05;       # default weight for clues
my $DefScoreThr=0.02;     # default score threshold
my $ScoreThrSteps=0.005;  # steps for changing score threshold

my $EvalStep=50;          # evaluation using EvalGold after X alignment runs

#my $LearnStep=50000000;   # (basically don't do it ....)

########################################################################

##
## get the optional arguments
##

my $TrainBitext;
my $TrainGold;
my $EvalBitext;
my $EvalGold;
my $HeldoutBitext;
my $HeldoutGold;
my $Bitext;

my $AlwaysDyn=0;    # =0 --> learn dynamic clues only if no improvements

while ($ARGV[0]=~/^\-/){                          # get some script options
    my $opt=shift(@ARGV);
    if ($opt eq '-tb'){$TrainBitext=shift(@ARGV);}
    elsif ($opt eq '-tg'){$TrainGold=shift(@ARGV);}
    elsif ($opt eq '-eb'){$EvalBitext=shift(@ARGV);}
    elsif ($opt eq '-eg'){$EvalGold=shift(@ARGV);}
    elsif ($opt eq '-hb'){$HeldoutBitext=shift(@ARGV);}
    elsif ($opt eq '-hg'){$HeldoutGold=shift(@ARGV);}
    elsif ($opt eq '-a'){$AlwaysDyn=1;}            # always learn dynamic clues
    elsif ($opt eq '-l'){$LearnStep=shift(@ARGV);} # learn dynamic clues each X align
    elsif ($opt eq '-r'){$RunMax=shift(@ARGV);}    # run max this nr of children
    elsif ($opt eq '-m'){$KeepMax=shift(@ARGV);}   # size of population
}

##
## get required arguments and optional language arguments
##

my $srclang;
my $trglang;
if (@ARGV<2){
    die "\nusage: optimize corpus gold-standard [srclang [trglang]]\n\n";
}
my $Bitext=shift @ARGV;
my $GoldLinks=shift @ARGV;


if (not -e $Bitext){die "cannot find corpus '$Bitext'!\n";}
if (not -e $GoldLinks){die "cannot find gold standard '$GoldLinks'!\n";}
if (@ARGV){$srclang=shift @ARGV;}
if (@ARGV){$trglang=shift @ARGV;}


########################################################################

my $thisdir=getcwd;
if (dirname($GoldLinks)!~/^\//){$GoldLinks="$thisdir/$GoldLinks";}
if (dirname($Bitext)!~/^\//){$Bitext="$thisdir/$Bitext";}

if (not $EvalBitext){$EvalBitext=$Bitext;}
if (not $TrainGold){$TrainGold=$GoldLinks;}
if (not $EvalGold){$EvalGold=$TrainGold;}
if (not $TrainBitext){$TrainBitext=$Bitext;}

########################################################################


my $UplugHome="$Bin/../..";
my $uplug=$UplugHome.'/uplug';
my $LinkConfig='systems/align/word/test/link';
my $align=$uplug.' '.$LinkConfig;
my $eval=$UplugHome.'/bin/evalalign.pl';
my $summarize=$Bin.'/summarize-eval';
my $remote=$ENV{HOME}.'/cvs/remote/remote';

my @gizaclues=('giza-word-prefix',
	       'giza-word-prefix-i',
	       'giza-word',
	       'giza-word-i',
	       'giza-pos',
	       'giza-pos-i',
	       'giza-pos-word',
	       'giza-pos-word-i',
#	       'giza-word-prefix1',
#	       'giza-word-prefix2',
#	       'giza-word-prefix4',
	       'giza-word-suffix',
	       'giza-word-suffix-i'
	      );

my @dynclues=(
#	      'dp',
##	      'dpp',
#	      'dc',
#	      'dx',
	      'dl',    # lex
	      'dlp',   # lex+pos
	      'dpx',   # pos+position
	      'dp3',   # pos-trigram
	      'dp3x',  # pos-trigram+position
#	      'dc3',   # chunk-trigram
#	      'dc3p',  # chunk-trigram+pos
	      'dc3x'   # chunk-trigram+position
	      );

my $dynpar=join (' 1 -',@dynclues);
my $dynamic="$uplug systems/align/word/dynamicclues -$dynpar 1";
my $DynNr;

my %StatAlignClues=(
		'dice'      => 'dice.dbm',
		'sim'       => 'str.dbm',
		'gw'        => 'giza-word.dbm',
		'gwi'       => 'giza-word-i.dbm',
		'gp'        => 'giza-pos.dbm',
		'gpi'       => 'giza-pos-i.dbm',
		'gpw'       => 'giza-pos-word.dbm',
		'gpwi'      => 'giza-pos-word-i.dbm',
		'gwp'       => 'giza-word-prefix.dbm',
		'gwpi'      => 'giza-word-prefix-i.dbm',
#		'gwp1'      => 'giza-word-prefix1.dbm',
#		'gwp2'      => 'giza-word-prefix2.dbm',
#		'gwp4'      => 'giza-word-prefix4.dbm',
		'gws'       => 'giza-word-suffix.dbm',
		'gwsi'       => 'giza-word-suffix-i.dbm'
		);

my %DynAlignClues=(
		'dp'     => 'pos.dbm',
#		'dpp'    => 'pos_coarse.dbm',
		'dc'     => 'chunk.dbm',
		'dx'     => 'position.dbm',
		'dl'     => 'lex.dbm',
		'dlp'    => 'lexpos.dbm',
		'dpx'    => 'posposi.dbm',
		'dp3'    => 'postri.dbm',
		'dp3x'   => 'postriposi.dbm',
		'dc3'    => 'chunktri.dbm',
		'dc3p'   => 'chunktripos.dbm',
		'dc3x'   => 'chunktriposi.dbm'
		);


#########################################################################
########################################################################

my $NrRunning=0;          # process counter
my $NrClueAlign=0;        # total number of alignment attempts

my @Setting;              # the set of settings
my %Running;              # running processes (indexed by filnames)
my %Tried;                # settings tried already (filenames)
my %Scores;               # scores
$Scores{F}={};

my $GoldSize=&GetGoldSize($TrainGold);  # check the size of the gold standard

#########################################################################
#########################################################################

&MakeEstimatedClues();                       # estimate clues
my @AvailableClues=&GetAvailableClues();     # get available clues
&CheckPrevious(\@Setting);                   # check previous aligment runs

if ($StartWithAllClues){
    &RunAllClues(\@Setting,\@AvailableClues);          # run with all clues
    for (2..$RunMax){                                  # make $RunMax copies
	push (@Setting,{});                            # of the base setting
	%{$Setting[-1]{clues}}=%{$Setting[-2]{clues}}; # (to be modified)
    }
}
else{
    if (not @Setting){                               # if Settings is empty:
	&RunSingleClues(\@Setting,\@AvailableClues); # run single clue settings
    }
}

#----------------------------------------------------------------------

#########################################################################
#########################################################################
### this is the main loop ###############################################
#########################################################################
#########################################################################

my $bestScore=0;        # best F-score so far
my $lastDyn;            # last link-setting used for learning clues
my $countNotImpr=0;     # count number loops without any improvement
my $modified=0;         # number of modified settings

my $NrScores=keys %{$Scores{F}};                      # nr results
my $nextDyn=(int($NrScores/$LearnStep)+1)*$LearnStep; # next time for learning clues
my $nextEval=(int($NrScores/$EvalStep)+1)*$EvalStep;  # next time for evaluation

# my $nextDyn=keys %{$Scores{F}};  # next time to learn clues 
# $nextDyn+=int()$LearnStep;            # from the best alignment
# my $nextEval=keys %{$Scores{F}}; # next time for doing an evaluation
# $nextEval+=$EvalStep;            # with the second gold-standard

do {
    &GetBestSettings(\@Setting,$KeepMax);     # get the $KeepMax best results
    $modified=0;
    foreach (0..$#Setting){                 # try to modify existing settings
	if ($StartWithAllClues){
	    if ($ModifyWeightsOnly){
		if (&ModifyClueWeights($Setting[$_])){$modified++;}
	    }
	    else{
		if (&ModifySetting2($Setting[$_])){$modified++;}
	    }
	}
	else{
	    if ($ModifyWeightsOnly){
		if (&ModifyClueWeights($Setting[$_])){$modified++;}
	    }
	    else{
#		if (&ModifySetting($Setting[$_])){$modified++;}
		if (&Mutate($Setting[$_])){$modified++;}
	    }
	}
    }
    if ($modified){                           # if there are any new settings
	&RunAllSettings(\@Setting);           # run them!
    }

    #--------------------------------------------------
    # learn dynamic clues from time to time
    # (if more than $nextDyn scores stored in the result-table)
    #    1) more than $LearnStep scores
    #    2) more than 2*$LearnStep scores
    #    3) more than 3*$LearnStep scores
    #    ....
    my $NrScores=keys %{$Scores{F}};

    if ($NrScores>$nextDyn){          # learn new dynamic clues
	$nextDyn+=$LearnStep;

	print "## top-list after $NrScores alignment runs (next=$nextDyn):\n";
	&PrintScores();
	my ($best)=sort {$Scores{F}{$b} <=> $Scores{F}{$a}} keys %{$Scores{F}};
	my $score=$Scores{F}{$best};                 # the best score so far


##########################################################################
#### old !!!!!!!!!!!!!!!!!!!!!!!!!!!
##########################################################################
#	#-------------------------
#	# if the score has been improved since last check
#
#	if ($score<=$bestScore){                     # not improved
#	    $countNotImpr++;                         # increment counter
#	    if ($countNotImpr>$maxNotImpr){          # that's enough
#		&FinishUp();                         # finish up now!
#	    }
#	}
#	else{
#	    $bestScore=$score;                       # improved? yes!->save it!
#	    if ($best and 
#		(not -e "$best.dyn") and                    # don't do it twice
#		(-e "$best.links.gz")){                     # links exist
##########################################################################
#### old up to here !!!!!!!!!!!!!!!!!!!!!!!!!!!
##########################################################################

	#-----------------------------------------------------------------
	# if the score has not been improved since last check
	#-----------------------------------------------------------------

	if (($score<=$bestScore) or $AlwaysDyn){  # if not improved (or always-learn-flag)
	    #---------------------------
	    # no improvement since
	    # last time we learned clues
	    #---------------------------
	    if ($Scores{F}{$lastDyn}>=$score){
		$countNotImpr++;                         # increment counter
		if ($countNotImpr>$maxNotImpr){          # that's enough
		    &FinishUp();                         # finish up now!
		}
	    }
	    #--------------------------------------
	    # improvement since last clue learning:
	    #     learn clues from the best one!!!
	    #--------------------------------------
	    elsif ($best and (not -e "$best.dyn")){         # not yet done?
		$DynNr++;                                   # increment counter
		$lastDyn=$best;                             # save link setting

		&MakeDynamicClues($best);

		#-------------------------------------
		if ($StartWithAllClues){                    # if StartWithAll:
		    &GetBestSettings(\@Setting,$KeepMax);   # get top results
		    foreach my $s (@Setting){               # add all clues
			&AllClueSetting($s);                #  to each setting
		    }                                       # and ...
		    &RunAllSettings(\@Setting);             #  ... run them!
#		    push (@Setting,{clues=>{}});            #  create a setting
#		    &AllClueSetting($Setting[-1]);          #  with all clues
		}
	    }
	}
	#-----------------------------------------------------------------
	# if the score has been improved since last check: save the score!
	#  (don't do anything else)
	#-----------------------------------------------------------------
	else{
	    $bestScore=$score;
	}
    }
    # -----------------
    sleep 1;
}
until ((not $modified) and (not keys %Running));



#-------------------------------------------------------------
&FinishUp();  # wait until all running processes are finished
#-------------------------------------------------------------

exit;

#########################################################################
#########################################################################
### this is the end #####################################################
#########################################################################
#########################################################################


use File::stat;

sub DynIsRunning{
    my @dynamic=keys %Running;
    @dynamic=grep (/^dyn/,@dynamic);
    foreach (@dynamic){
	if (-e "$_.links"){
	    my $fstat=stat("$_.links");        # check file statistics
	    my $mtime=time-$fstat->mtime;      # time - last-modification-time
	    if ($mtime>600){                   # no modification since 10 min?
		unlink ("$_.links");           # --> remove the file!
		delete $Running{$_};           # --> remove from Running
	    }                                  # otherwise:
	    else{return 1;}                    # yes, still running!
	}
    }
    return 0;
}

#-------------------------
# check previous runs
# read their scores
# save their settings
#-------------------------

sub CheckPrevious{
    my $settings=shift;
    opendir(DIR, '.');
    my @eval = grep { /\.eval/ } readdir(DIR);
    @eval = grep { $_!~/giza/ } @eval;
    closedir DIR;
    foreach my $file (@eval){
	$file=~s/\.eval//;
	if ($file=~/dyn([0-9]+)\_/){    # if there are already some dynamic
	    if ($1>$DynNr){$DynNr=$1;}  # clues: check the DynNr
	}
	push(@{$settings},{});
	&File2Setting($settings->[-1],$file);
	$Tried{$file}=1;
	system "touch $file.ready";
	&CheckScore($settings,$file);
	unlink "$file.ready";
    }
}

#-------------------------

sub FinishUp{
    my $running=scalar keys %Running;
    while ($running){
	$running=0;
	foreach my $r (keys %Running){
	    if (&CheckScore(\@Setting,$r)){
		$NrRunning--;
		delete $Running{$r};
	    }
	    elsif (-e "$r.links"){$running++;}
	}
	sleep 1;
    }
    foreach my $r (keys %Running){
	print "warn: failed to run $r\n!";
    }

    my $NrScores=keys %{$Scores{F}};
    print "## final top-list after $NrScores alignment runs:\n";
    &PrintScores();
}


#########################################################################
###### this is the end ##################################################
#########################################################################
#########################################################################






sub GetBestSettings{
    my $set=shift;
    my $max=shift;

##-----------------------------------------------------------
## wait for all currently running processes to be finished
## (comment this out if fast processes should be preferred!) 
# 
#    while (keys %Running){
	foreach my $r (keys %Running){
	    if (&CheckScore($set,$r)){
		$NrRunning--;
		delete $Running{$r};
	    }
	}
#	sleep 1;
#    }
##-----------------------------------------------------------

    my @best=sort {$Scores{F}{$b} <=> $Scores{F}{$a}} keys %{$Scores{F}};
    if ($#best<$max){return;}
    @{$set}=();
    my $count=0;
    while (@best){
	my $file=shift(@best);
	$count++;                       # count the number of settings
	if ($count>$max){               # keep only the $max best settings!
	    foreach (@best){            # remove all other link files:
		if (-e "$_.links.gz"){unlink "$_.links.gz";}
		if (-e "$_.links"){unlink "$_.links";}
	    }
	    return;
	}
	push(@{$set},{});
	&File2Setting($set->[-1],$file);
    }
}

###########################################################

sub RunSingleClues{
    my ($sets,$clues)=@_;
    while (@{$clues}){                              # create inital settings
	my $clue=shift(@{$clues});                  # foreach available clue
	push (@{$sets},{clues=>{$clue=>$DefWeight}, #   create settings with
			lastchange=>'+'.$clue,      #   single clues
			modified=>1});
    }
    &RunAllSettings($sets);                  # run all clue alignment settings
}

sub RunAllClues{
    my ($sets,$clues)=@_;
    push (@{$sets},{clues=>{}});
    &AllClueSetting($sets->[-1]);
    &RunAllSettings($sets);                  # run all clue alignment settings
}

###########################################################
# RunAllSettings
#
# check evaluation files first
# then run all settings in the settings-array


sub RunAllSettings{
    my $settings=shift;

    print "Running: ";
    print scalar keys %Running;
    print ", Scores: ";
    print scalar keys %{$Scores{F}};
    print ", NrClues: ";
    print scalar @AvailableClues;
    print ", NrClueAlign: $NrClueAlign, NrRunning: $NrRunning";
    #-----------------------------------------------
    do {
	if (keys %Running){
	    foreach my $r (keys %Running){
		if (&CheckScore($settings,$r)){
		    $NrRunning--;
		    delete $Running{$r};
		}
	    }
	}
	sleep 1;
    }
    until ($NrRunning<$RunMax);
    #-----------------------------------------------
#    print "running: ";
#    print scalar keys %Running;
    print " ($NrRunning)\n";
#    print "";


    foreach my $s (@{$settings}){
	if ($s->{modified}){
	    my $file=&Setting2File($s);
	    system "touch $file.ready";
	    if (&CheckScore($settings,$file)){next;}
	    unlink "$file.ready";
	    my $file=&RunClueAligner($s);
#	    $NrRunning++;
#	    $Running{$file}=$s->{lastchange};
	}
    }
    $NrRunning=scalar keys %Running;
}

###########################################################
# take all available clues!!!!

sub AllClueSetting{
    my $s=shift;

    my @AllClues=&GetAvailableClues();           # get all available clues
    @AvailableClues=@AllClues;
    foreach (@AllClues){                         # for all clue types
	if (not defined $s->{clues}->{$_}){      # if not included already:
	    $s->{clues}->{$_}=$DefWeight;        #   set default weight
	}
    }
    my $file=&Setting2File($s);          #   get the filebase
    if ((defined $Scores{F}{$file}) or   #   if tested already or
	(defined $Tried{$file})){        #   if already running:
	delete $s->{clues}->{$_};        #      don't try again
    }
    else{                                # otherwise:
	$s->{modified}=1;                #   give it a shot
	return 1;
    }
    return 0;
}


###########################################################
# modify clue alignment settings
# change clue weights ONLY!!!!!

sub ModifyClueWeights{
    my $s=shift;

    #----------------
    # check without modification
    # (might be ok if new dynamic clues exist!
    #----------------
    my $file=&Setting2File($s);           #   get the filebase
    if ((not defined $Scores{F}{$file})   #   if tested already or
	and (not defined $Tried{$file})){ #   if already running:
	$s->{modified}=1;                 #   give it a shot
	return 1;                         #   (mark as modified)
    }

    my $step=$ClueWeightSteps;
    my @clues=keys %{$s->{clues}};
    my $file;
    do {
	my $idx=int(rand $#clues+1);             # random index number
	my $clue=$clues[$idx];                   # (choose random clue)
	my $sign=-1;                             # decrease
	if (int(rand 2)){$sign=1;}               # or increase
	$s->{clues}->{$clue}+=$sign*$step;       # change the weight
	$s->{clues}->{$clue}=
	    (int(100*$s->{clues}->{$clue}))/100; 
	$file=&Setting2File($s);                 # convert to file base
    }
    until ((not defined $Scores{F}{$file})       # do this until a new setting
	   and (not defined $Tried{$file}));     # is found!

    $s->{modified}=1;                            # try it (mark as modified
    return 1;                                    #         and return)
}

# end of modify setting
###########################################################


###########################################################
# modify clue alignment settings (version 2)
#
# 1) delete available clues
# 2) change clue weights

sub ModifySetting2{
    my $s=shift;

    #----------------
    # check without modification
    # (might be ok if new dynamic clues exist!
    #----------------
    my $file=&Setting2File($s);           #   get the filebase
    if ((not defined $Scores{F}{$file})   #   if tested already or
	and (not defined $Tried{$file})){ #   if already running:
	$s->{modified}=1;                 #   give it a shot
	return 1;                         #   (mark as modified)
    }

    my @AllClues=&GetAvailableClues();           # get all available clues
    @AvailableClues=@AllClues;
##    foreach my $c (@AllClues){                   # for all clue types
    while (@AllClues){                           # as long as there are clues
	my $idx=int(rand $#AllClues+1);          # random index number
	my $c=splice(@AllClues,$idx,1);          # get/remove the clue
	if (defined $s->{clues}->{$c}){          # if included already:
	    my $weight=$s->{clues}->{$c};        #   save the weight
	    delete $s->{clues}->{$c};            #   delete the clue
	    my $file=&Setting2File($s);          #   get the filebase
	    if ((defined $Scores{F}{$file}) or   #   if tested already or
		(defined $Tried{$file})){        #   if already running:
		$s->{clues}->{$c}=$weight;       #   put it back again
	    }
	    else{                                # otherwise:
		$s->{modified}=1;                #   give it a shot
		return 1;                        #   (mark as modified)
	    }
	}
    }
    #-----------------------------
    # deleting clues didn't work
    #   --> try to change weights!
    #-----------------------------

    my $step=$ClueWeightSteps;
    my @clues=keys %{$s->{clues}};
    my $file;
    do {
	my $idx=int(rand $#clues+1);             # random index number
	my $clue=$clues[$idx];                   # (choose random clue)
	my $sign=-1;                             # decrease
	if (int(rand 2)){$sign=1;}               # or increase
	$s->{clues}->{$clue}+=$sign*$step;       # change the weight
	$s->{clues}->{$clue}=
	    (int(100*$s->{clues}->{$clue}))/100; 
	$file=&Setting2File($s);                 # convert to file base
    }
    until ((not defined $Scores{F}{$file})       # do this until a new setting
	   and (not defined $Tried{$file}));     # is found!

    $s->{modified}=1;                            # try it (mark as modified
    return 1;                                    #         and return)
}

# end of modify setting
###########################################################



###########################################################
# modify clue alignment settings
#
# 1) add available clues
# 2) change clue weights

sub ModifySetting{
    my $s=shift;

    #----------------
    # check without modification
    # (might be ok if new dynamic clues exist!
    #----------------
    my $file=&Setting2File($s);           #   get the filebase
    if ((not defined $Scores{F}{$file})   #   if tested already or
	and (not defined $Tried{$file})){ #   if already running:
	$s->{modified}=1;                 #   give it a shot
	return 1;                         #   (mark as modified)
    }

    my @AllClues=&GetAvailableClues();           # get all available clues
    @AvailableClues=@AllClues;
##    foreach my $c (@AllClues){                   # for all clue types
    while (@AllClues){                           # as long as there are clues
	my $idx=int(rand $#AllClues+1);          # random index number
	my $c=splice(@AllClues,$idx,1);          # get/remove the clue
	if (not defined $s->{clues}->{$c}){      # if not included already:
	    $s->{clues}->{$c}=$DefWeight;        #   set default weight
	    my $file=&Setting2File($s);          #   get the filebase
	    if ((defined $Scores{F}{$file}) or   #   if tested already or
		(defined $Tried{$file})){        #   if already running:
		delete $s->{clues}->{$c};        #      don't try again
	    }
	    else{                                # otherwise:
		$s->{modified}=1;                #   give it a shot
		return 1;                        #   (mark as modified)
	    }
	}
    }

    my $step=$ClueWeightSteps;
    my @clues=keys %{$s->{clues}};
    my $file;
    do {
	my $idx=int(rand $#clues+1);             # random index number
	my $clue=$clues[$idx];                   # (choose random clue)
	my $sign=-1;                             # decrease
	if (int(rand 2)){$sign=1;}               # or increase
	$s->{clues}->{$clue}+=$sign*$step;       # change the weight
	$s->{clues}->{$clue}=
	    (int(100*$s->{clues}->{$clue}))/100; 
	$file=&Setting2File($s);                 # convert to file base
    }
    until ((not defined $Scores{F}{$file})       # do this until a new setting
	   and (not defined $Tried{$file}));     # is found!

    $s->{modified}=1;                            # try it (mark as modified
    return 1;                                    #         and return)
}

# end of modify setting
###########################################################







###########################################################
# modify clue alignment settings
#
# 1) add available clues
# 2) change clue weights

sub Mutate{
    my $s=shift;

    #----------------
    # check without modification
    # (might be ok if new dynamic clues exist!
    #----------------
    my $file=&Setting2File($s);           #   get the filebase
    if ((not defined $Scores{F}{$file})   #   if tested already or
	and (not defined $Tried{$file})){ #   if already running:
	$s->{modified}=1;                 #   give it a shot
	return 1;                         #   (mark as modified)
    }


    my @AllClues=&GetAvailableClues();           # get all available clues
    @AvailableClues=@AllClues;

    # 0 --> add clue
    # 1 --> increase weight
    # 2 --> decrease weight
    # 3 --> increase minimal score
    # 4 --> decrease minimal score
    # 5 --> remove clue

    while (1){
	my $operation=int(rand 60);
	if ($operation<30){
	    if (&AddRandomClue($s,@AllClues)){
		return 1;
	    }
	}
	elsif ($operation<40){
	    if (&ChangeRandomClueWeight($s,$ClueWeightSteps)){
		return 1;
	    }
	}
	elsif ($operation<50){
	    if (&ChangeRandomClueWeight($s,0-$ClueWeightSteps)){
		return 1;
	    }
	}
	elsif ($operation<54){
	    if (&ChangeScoreThreshold($s,$ScoreThrSteps)){
		return 1;
	    }
	}
	elsif ($operation<58){
	    if (&ChangeScoreThreshold($s,0-$ScoreThrSteps)){
		return 1;
	    }
	}
	elsif ($operation<60){
	    if (&RemoveRandomClue($s)){
		return 1;
	    }
	}
    }
}

sub AddRandomClue{
    my $s=shift;
    my @AllClues=@_;
    while (@AllClues){
	my $idx=int(rand $#AllClues+1);          # random index number
	my $c=splice(@AllClues,$idx,1);          # get/remove the clue
	if (not defined $s->{clues}->{$c}){      # if not included already:
	    $s->{clues}->{$c}=$DefWeight;        #   set default weight
	    my $file=&Setting2File($s);          #   get the filebase
	    if ((defined $Scores{F}{$file}) or   #   if tested already or
		(defined $Tried{$file})){        #   if already running:
		delete $s->{clues}->{$c};        #      don't try again
	    }
	    else{                                # otherwise:
		$s->{modified}=1;                #   give it a shot
		return 1;                        #   (mark as modified)
	    }
	}
    }
    return 0;
}

sub RemoveRandomClue{
    my $s=shift;
    my @AllClues=keys %{$s->{clues}};
    my $file;

    while ($#AllClues){                      # as long as there are >2 clues:
	my $idx=int(rand $#AllClues+1);      #    choose a clue
	my $c=splice(@AllClues,$idx,1);      #    get the clue name
	my $old=$s->{clues}->{$c};           #    save the old weight
	delete $s->{clues}->{$c};            #    delete the clue
	my $file=&Setting2File($s);          #    get the filebase
	if ((defined $Scores{F}{$file}) or   #    if tested already or
	    (defined $Tried{$file})){        #    if already running:
	    $s->{clues}->{$c}=$old;          #       restore clue
	}
	else{                                #    otherwise:
	    $s->{modified}=1;                #       give it a shot
	    return 1;                        #       (mark as modified)
	}
    }
    return 0;
}


sub ChangeRandomClueWeight{
    my $s=shift;
    my $step=shift;

    my @clues=keys %{$s->{clues}};
    my $file;
#    do {
	my $idx=int(rand $#clues+1);              # random index number
	my $clue=$clues[$idx];                    # (choose random clue)
	my $old=$s->{clues}->{$clue};             # save the old weight
	$s->{clues}->{$clue}+=$step;              # change the weight
	$s->{clues}->{$clue}=
	    (int(100*$s->{clues}->{$clue}))/100; 
	$file=&Setting2File($s);                  # convert to file base
#    }
#    until ((not defined $Scores{F}{$file})       # 
#	   and (not defined $Tried{$file}));
#    $s->{modified}=1;
#    return 1;

    if ((not defined $Scores{F}{$file})       # 
	and (not defined $Tried{$file})){     #
	$s->{modified}=1;
	return 1;
    }
    $s->{clues}->{$clue}=$old;                # restore the old weight
    return 0;
}

sub ChangeScoreThreshold{
    my $s=shift;
    my $step=shift;

    my $file;
#    do{
	if (not defined $s->{score}){
	    $s->{score}=$DefScoreThr;
	}
	else{
	    $s->{score}+=$step;
	}
	if ($s->{score}<=0){
	    delete $s->{score};
	}
	$file=&Setting2File($s);                 # convert to file base
#    }
#    until ((not defined $Scores{F}{$file})       # 
#	   and (not defined $Tried{$file}));
#    $s->{modified}=1;
#    return 1;

    if ((not defined $Scores{F}{$file})       # 
	and (not defined $Tried{$file})){     #
	$s->{modified}=1;
	return 1;
    }
    return 0;
}


# end of modify setting
###########################################################




###########################################################
# check scores:
#
# check if there's an eval-file with some scores
# save scores in the Score-hash
# restart the clue aligner if something went wrong

sub CheckScore{
    my $setting=shift;
    my $file=shift;
    if (-e "$file.ready" && -e "$file.eval"){
#    if (-e "$file.eval"){
	open (E, "$file.eval");
	my @link = <E>;
	close E;

	#-------------------------------------------------------
	# check the size of the evaluation file
	# and compare it to the size of the gold standard
	#   if they are not identical --> something went wrong
	#                             --> re-run the alignment!

	my ($size)=grep /size of gold standard/,@link;
	$size=~s/^.*size of gold standard:\s*([0-9]+)\,.*$/$1/;
	if (($size<$GoldSize) or ($size>$GoldSize+5)){
	    if (-e "$file.links.gz"){unlink "$file.links.gz";}
	    if (-e "$file.links"){unlink "$file.links";}
	    unlink "$file.eval";                         # delete old eval-file
	    unlink "$file.ready";                        # delete ready-flag
	    delete $Running{$file};                      # delete from running
	    delete $Tried{$file};
	    delete $Scores{F}{$file};                    # delete scores
#	    sleep 1;                               # JT 16-11-2004:
#	    push (@{$setting},{});                 # do not restart alignments
#	    &File2Setting($setting->[-1],$file);   # in case of corrupted
#	    $setting->[-1]->{modified}=1;          # evaluation files (it may
#	    &RunClueAligner($setting->[-1]);       # cause conflicts!!!!)
	    return 0;
	}
	#-------------------------------------------------------

	if ($link[-1]=~/F:\s+([0-9]+\.[0-9]+)\%/){       # save the F-score
	    $Scores{F}{$file}=$1;
	    if ($link[-2]=~/precision:\s+([0-9]+\.[0-9]+)\%/){   # precision
		$Scores{P}{$file}=$1;
	    }
	    if ($link[-3]=~/recall:\s+([0-9]+\.[0-9]+)\%/){   # precision
		$Scores{R}{$file}=$1;
	    }
	    if (-e "$file.log"){
		open (L, "$file.log");
		my ($time) = grep {/processing time:/} <L>;
		close L;
		if ($time=~/processing time:\s*([0-9].*)$/){
		    $time=$1;
		    while ($time=~s/^\s*0:\s*//){};
		    $Scores{time}{$file}=$time;
		}
		unlink "$file.log";
	    }
	    unlink "$file.ready";
	    return 1;
	}
    }
    return 0;
}

# end of CheckScores
###########################################################



###########################################################
# PrintScore: print the current score table

sub PrintScores{
    if (not keys %Scores){return;}
    print "----------------------------------------------------\n";
    printf "%5s  %5s  %5s %8s %s\n",'P','R','F','time','name';
    print "----------------------------------------------------\n";
    foreach (sort {$Scores{F}{$b} <=> $Scores{F}{$a}} keys %{$Scores{F}}){
	printf "%3.2f  %3.2f  %3.2f %8s %s\n",
	$Scores{P}{$_},$Scores{R}{$_},$Scores{F}{$_},$Scores{time}{$_},$_;
    }
    print "----------------------------------------------------\n";
}


###########################################################
# GetGoldSize: get the size of the gold standard

sub GetGoldSize{
    my $file=shift;
    open (G, $file);
    my @links = grep {/\<(wordLink|link)\s/} <G>;
    close G;
    my %uniq=();                   # there are some copies in some
    map ($uniq{$_}=1,@links);      # gold standard files --> count only
    return scalar keys %uniq;      # unique links!
#    return scalar @links;
}


###########################################################
# GetAvailableClues: get all available clues
#                    (check if dbm-files exist)

sub GetAvailableClues{
    my @avail=();
    foreach (keys %StatAlignClues){
	if (-e "data/runtime/$StatAlignClues{$_}"){
	    push (@avail,$_);
	}
    }
    foreach (keys %DynAlignClues){
	if (-e "data/runtime/$DynAlignClues{$_}"){
	    push (@avail,$_);
	}
    }
    if (defined $srclang && defined $trglang){
	if (open (C, "$UplugHome/$LinkConfig")){
	    my @config=<C>;
	    close C;
	    if (grep {/\'${srclang}${trglang}p\'/} @config){
		push (@avail,"${srclang}${trglang}p");
	    }
	    if (grep {/\'${srclang}${trglang}pp\'/} @config){
		push (@avail,"${srclang}${trglang}pp");
	    }
	    if (grep {/\'${srclang}${trglang}c\'/} @config){
		push (@avail,"${srclang}${trglang}c");
	    }
	}
    }
    return @avail;
}


########################################################################
########################################################################
########################################################################


sub MakeLinkSubDir{
    use File::Basename;
    my $links=shift;
    my $dir=$links;
#    $dir=~s/\.links.gz//;
#    $dir=uc($dir);
    mkdir $dir,0755;
    my ($name,$path)=fileparse($links);
    link $links,"$dir/$name";
    return $dir;
}


#-------------------
# MakeClueLinks: link non-existing clues to data/runtime 
#                in a given sub-directory

sub MakeClueLinks{
    my $dir=shift;
    opendir(DIR, 'data/runtime');                  # make links to clues
    my @dbm = grep { /\.dbm/ } readdir(DIR);
    closedir DIR;
    mkdir "$dir/data",0755;
    mkdir "$dir/data/runtime",0755;
    foreach (@dbm){
	if (not -e "$dir/data/runtime/$_"){
	    link "data/runtime/$_","$dir/data/runtime/$_";
	}
    }
}






###########################################################
# RunClueAligner
#
#-------------------------------------------------------
# run clue aligner 
#    settings: a set of clues with weights
#-------------------------------------------------------

sub RunClueAligner{

    my $setting=shift;
    if (ref($setting->{clues}) ne 'HASH'){return;}

    my @clues=keys %{$setting->{clues}};
    if (not @clues){return;}
    my @weights=values %{$setting->{clues}};
    my $minscore=$setting->{score};

    my $param='';
    foreach (keys %{$setting->{clues}}){
#	$param.='-'.$_.' ';
	if ($setting->{clues}->{$_}){
	    $param.='-'.$_.' ';
	    $param.='-'.$_.'_w '.$setting->{clues}->{$_}.' ';
	}
    }
    if ($minscore){$param.=" -min $minscore";}

    my $file=&Setting2File($setting);

    my $comm="nice $align -in $TrainBitext $param -out $file.links 2>$file.log";
    $comm.=";nice gzip -f $file.links";
#    $comm.=";nice gzip -f $file.log";
    $comm.=";nice $eval -gold $TrainGold -in $file.links.gz >$file.eval";
    if (-e $HeldoutBitext){
	if ($HeldoutBitext and (not -e 'HELDOUT')){mkdir 'HELDOUT',0755;}
	$comm.=";nice $align -in $HeldoutBitext $param -out HELDOUT/$file.links 2>HELDOUT/$file.log";
	$comm.=";nice gzip -f HELDOUT/$file.links";
	$comm.=";nice gzip -f HELDOUT/$file.log";
	if (-e $HeldoutGold){
	    $comm.=";nice $eval -gold $HeldoutGold -in HELDOUT/$file.links.gz >HELDOUT/$file.eval";
	    $comm.=";nice rm HELDOUT/$file.links.gz";
	}
    }
    if (-e $EvalBitext){
	if ($EvalBitext and (not -e 'EVAL')){mkdir 'EVAL',0755;}
	$comm.=";nice $align -in $EvalBitext $param -out EVAL/$file.links 2>EVAL/$file.log";
	$comm.=";nice gzip -f EVAL/$file.links";
	$comm.=";nice gzip -f EVAL/$file.log";
	if (-e $EvalGold){
	    $comm.=";nice $eval -gold $EvalGold -in EVAL/$file.links.gz >EVAL/$file.eval";
	    $comm.=";nice rm EVAL/$file.links.gz";
	}
    }
    $comm.=";touch $file.ready";

    $setting->{modified}=0;

    if ((-e "$file.links") or
	(-e "$file.eval") or
	(-e "$file.links.gz")){
	print "warning: clue alignment '$file' already in progress!!!!\n";
	return undef;
    }
    if (defined $Running{$file}){
#	print "warning: clue alignment '$file' already running!!!!\n";
	$Running{$file}=1;
	return undef;
    }
    if (defined $Tried{$file}){
        print "warning: clue alignment '$file' already tried earlier!!!!\n";
	$Tried{$file}=1;
        return undef;
    }
    if (defined $Scores{F}{$file}){
	print "warning: there is a score for clue alignment '$file'!!!!\n";
	$Tried{$file}=1;
	return undef;
    }
    $Running{$file}=1;
    $Tried{$file}=1;
    $NrClueAlign++;
#    print "run $file\n";
    system ("$remote '$comm'");
    return $file;
}

# end of RunClueAligner
###########################################################



###########################################################
# make a unique filename for a clue aligner setting

sub Setting2File{

    my $setting=shift;
    if (ref($setting->{clues}) ne 'HASH'){return undef;}
    my $file='C';         # start file names with 'C' (to avoid initial '-') 
    my $InclDynClue=0;
    foreach (sort keys %{$setting->{clues}}){
	if (defined $setting->{clues}->{$_}){          # weights before clues!
	    $file.=$setting->{clues}->{$_};
	}
	$file.=$_;                                     # clue name
	if (defined $DynAlignClues{$_}){$InclDynClue=1;}
	$file.='+';
    }
    chop ($file);
    if (not $DynNr){$DynNr=1;}
    if ($InclDynClue){$file='dyn'.$DynNr.'__'.$file;}  # add dynclue-marker
    my $minscore=$setting->{score};
    if ($minscore){$file.="_min$minscore";}
    return $file;
}


###########################################################
# create the clue aligner setting from a uniqe filename

sub File2Setting{

    my $setting=shift;
    my $file=shift;

    $file=~s/^.*?\_\_//;            # remove dynclue-marker
    $file=~s/^C//;                  # remove initial 'C'
    if ($file=~s/(\_min)(.*)$//){   # check if there's a score threshold
	$setting->{score}=$2;
    }
    my @clues=split(/\+/,$file);    # split into clues
    foreach (@clues){
	if (/^([0-9\.]*)([^0-9].*)$/){
	    $setting->{clues}->{$2}=$1;
	}
	else{
	    $setting->{clues}->{$_}=$DefWeight;
	}
    }
}


###########################################################
# MakeEstimatedClues
#
#------------------------------------------------
# create giza clues and basic clues
# and wait until they are finished
#------------------------------------------------


sub MakeEstimatedClues{

#------------------------------------------------
# run giza remotely

    print "create estimated clues!\n";
    foreach my $c (@gizaclues){
	if (-e "$c.eval"){
	    next;
	}
	my $comm="$uplug systems/align/word/test/$c -in $Bitext -out $c.links";
	$comm.=";gzip -f $c.links;$eval -gold $TrainGold -in $c.links.gz >$c.eval";
	system ("$remote '$comm'");
    }

#------------------------------------------------
# create basic clues (dice+sim)

    if ((not -e "data/runtime/str.dbm") or 
	(not -e "data/runtime/dice.dbm")){
	print "create basic clues!\n";
	system("$uplug systems/align/word/basicclues -in $Bitext");
    }

#------------------------------------------------
# wait for the remote alignments (giza)

    foreach my $c (@gizaclues){
	print "waiting for $c.eval!\n";
	while (not -e "$c.eval"){sleep 1;}
    }
}



###########################################################
# MakeDynamicClues
#
#------------------------------------------------
# learn dynamic clues from some previous links
#------------------------------------------------

sub MakeDynamicClues{

    my $file=shift;
    my $thisdir=getcwd;                         # the current dir
    my $links="$thisdir/__dynamic.links";        # training data

    print "align $Bitext with $file!\n";   # (this should be done in a common
    my %setup=();                          #  sub-routine like in 
    &File2Setting(\%setup,$file);          #  RunClueAligner)
    my $param='';
    if (ref($setup{clues}) ne 'HASH'){return;}
    foreach (keys %{$setup{clues}}){
	if ($setup{clues}{$_}){
	    $param.='-'.$_.' ';
	    $param.='-'.$_.'_w '.$setup{clues}{$_}.' ';
	}
    }
    if ($setup{score}){$param.=" -min $setup{score}";}
    my $comm="nice $align -in $Bitext $param -out $links 2>__dynamic.log";
    $comm.=";touch __dynamic.ready";
    system ("$remote '$comm'");


    print "wait for __dynamic.ready!\n"; 
    while (not -e '__dynamic.ready'){sleep 1;}


    print "learn dynamic clues from __dynamic.links\n";
    if (not -e 'dynamic'){mkdir 'dynamic',0755;}     # training dir
    chdir 'dynamic';                            #   (go there)
    my $comm="$dynamic -in $links >dyn$DynNr.out 2>dyn$DynNr.log";
    $comm.=";touch dyn$DynNr.ready;";
    system ("$remote '$comm'");

    print "wait for the new dynamic clues to be finished\n";
    while (not -e "dyn$DynNr.ready"){sleep 1;}  # wait until ready
    unlink "dyn$DynNr.ready";
    chdir $thisdir;                             #   (go back)

    #-------------------------------------
    # wait until all alignment processes
    # using dynamic clues are finished
    # (this should be probably checked more carefully)
    #-------------------------------------
    print "wait for processes using old dynamic clues!\n";
    while (&DynIsRunning){sleep 1;}
    #-------------------------------------
    # move dynamic clues
    # (overwrite old ones!!!)
    #-------------------------------------
    system ("mv dynamic/data/runtime/*dbm* data/runtime/");
    system ("touch $file.dyn");                 # create a flag
    #-------------------------------------

}