#!/usr/bin/perl
#--------------------------------------------------------------------------
# -*-perl-*-
#
# Copyright (C) 2004 Jörg Tiedemann <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id$
#--------------------------------------------------------------------------
# USAGE: optimize [OPTIONS] bitext.ces align.gold [src-lang [trg-lang]]
#
# OPTIONS
#
# -tb bitext.train.... bitext for training (xces align format)
# -tg train.gold ..... gold standard for training
# -eb bitext.test .... bitext for evaluation (default=same as training)
# -eg test.gold ...... gold standard for evaluation (default=tg)
# -hb bitext.heldout . held-out data for evaluation
# -hg heldout.gold ... gold standard for heldout data
# -l nr .............. learn dynamic clues after each <nr> alignments
# -a ................. always learn dynamic clues (even if improvements
# encountered during the previous generation)
# -m nr .............. keep at most <nr> individuals for producing new
# offspring
# -r nr .............. start (run) at most <nr> amounts of new
# alignment processes (children)
#
#
# bitext.ces ..... sentence alignment file a la XCES (used for estimating
# clues and asa default bitext if no other ones specified)
# align.gold ..... default gold standard (for training and evaluation if
# no other ones specified)
# src-lang ....... source language id, e.g. 'sv' (optional)
# trg-lang ....... source language id, e.g. 'en' (optional)
#
# * heldout data should be distinct from all other data (bitext.ces,
# bitext.train and bitext.test)
#
# * train.gold and test.gold should be disjoint (but may be included in the
# same data set from which the clues are derived (bitext.ces))
#
# * bitext.train and bitext.test should only include sentence pairs for which
# links are defined in the according gold standards (to save processing time)
#
# * word alignment and evaluation (fitness) will be run for training data
# alignment and evaluation for testing and heldout data is only done if
# they are specified (how obvious ...)
#
# * probability of mutation operations is hard-coded !!!!! (see Mutate sub)
# * add random clue = 0.5
# * increase clue weight of randomly chosen clue = 0.167
# * decrease clue weight of randomly chosen clue = 0.167
# * increase overall score threshold = 0.067
# * decrease overall score threshold = 0.067
# * remove random clue = 0.067
# * evaluation results will be stored in a separate sub-directory EVAL
# which will be created
# (if non-existsing) in the current directory
# * evaluations of the held-out data will be stored in a separate
# HELDOUT subdirectory
# (which will also be created if it does not exist already)
#
#--------------------------------------------------------------------------
# run lots of remote alignments
# config-files: uplug/systems/align/word/test/
#--------------------------------------------------------------------------
# this script
# * creates word alignment clues
# - basic clues
# - giza clues (requires GIZA++)
# - dynamic clues from time to time
# (each time more than $LearnStep new
# scores have been counted)
# * runs the clue aligner remotely (requires remote-scripts)
# (max $RunMax processes at a time)
# * evaluates alignments using a gold standard
# * collects scores
# * modifies clue settings
# - deletes clues ($StartWithAllClues=1)
# - adds clues ($StartWithAllClues=0)
# - changes clue weights (if no other modification is possible or
# $ModifyWeightsOnly=1)
# * only the 10 best settings are kept and modified
# (can be changed: look at $Keepmax)
#
#--------------------------------------------------------------------------
use FindBin qw($Bin);
use strict;
use Cwd;
use File::Basename;
my $StartWithAllClues=0; # =1 --> start with all available clues
my $ModifyWeightsOnly=0; # =1 --> modify weights only (don't add/delete clues)
my $RunMax=10; # max number of remote processes
my $KeepMax=10; # keep the best KeepMax settings before modfiying
my $ClueWeightSteps=0.01; # steps for changing clue weights
my $LearnStep=100; # how often to learn new dynamic clues
my $maxNotImpr=25; # stop after a certain number of clue-learning
# without any improvement
my $DefWeight=0.05; # default weight for clues
my $DefScoreThr=0.02; # default score threshold
my $ScoreThrSteps=0.005; # steps for changing score threshold
my $EvalStep=50; # evaluation using EvalGold after X alignment runs
#my $LearnStep=50000000; # (basically don't do it ....)
########################################################################
##
## get the optional arguments
##
my $TrainBitext;
my $TrainGold;
my $EvalBitext;
my $EvalGold;
my $HeldoutBitext;
my $HeldoutGold;
my $Bitext;
my $AlwaysDyn=0; # =0 --> learn dynamic clues only if no improvements
while ($ARGV[0]=~/^\-/){ # get some script options
my $opt=shift(@ARGV);
if ($opt eq '-tb'){$TrainBitext=shift(@ARGV);}
elsif ($opt eq '-tg'){$TrainGold=shift(@ARGV);}
elsif ($opt eq '-eb'){$EvalBitext=shift(@ARGV);}
elsif ($opt eq '-eg'){$EvalGold=shift(@ARGV);}
elsif ($opt eq '-hb'){$HeldoutBitext=shift(@ARGV);}
elsif ($opt eq '-hg'){$HeldoutGold=shift(@ARGV);}
elsif ($opt eq '-a'){$AlwaysDyn=1;} # always learn dynamic clues
elsif ($opt eq '-l'){$LearnStep=shift(@ARGV);} # learn dynamic clues each X align
elsif ($opt eq '-r'){$RunMax=shift(@ARGV);} # run max this nr of children
elsif ($opt eq '-m'){$KeepMax=shift(@ARGV);} # size of population
}
##
## get required arguments and optional language arguments
##
my $srclang;
my $trglang;
if (@ARGV<2){
die "\nusage: optimize corpus gold-standard [srclang [trglang]]\n\n";
}
my $Bitext=shift @ARGV;
my $GoldLinks=shift @ARGV;
if (not -e $Bitext){die "cannot find corpus '$Bitext'!\n";}
if (not -e $GoldLinks){die "cannot find gold standard '$GoldLinks'!\n";}
if (@ARGV){$srclang=shift @ARGV;}
if (@ARGV){$trglang=shift @ARGV;}
########################################################################
my $thisdir=getcwd;
if (dirname($GoldLinks)!~/^\//){$GoldLinks="$thisdir/$GoldLinks";}
if (dirname($Bitext)!~/^\//){$Bitext="$thisdir/$Bitext";}
if (not $EvalBitext){$EvalBitext=$Bitext;}
if (not $TrainGold){$TrainGold=$GoldLinks;}
if (not $EvalGold){$EvalGold=$TrainGold;}
if (not $TrainBitext){$TrainBitext=$Bitext;}
########################################################################
my $UplugHome="$Bin/../..";
my $uplug=$UplugHome.'/uplug';
my $LinkConfig='systems/align/word/test/link';
my $align=$uplug.' '.$LinkConfig;
my $eval=$UplugHome.'/bin/evalalign.pl';
my $summarize=$Bin.'/summarize-eval';
my $remote=$ENV{HOME}.'/cvs/remote/remote';
my @gizaclues=('giza-word-prefix',
'giza-word-prefix-i',
'giza-word',
'giza-word-i',
'giza-pos',
'giza-pos-i',
'giza-pos-word',
'giza-pos-word-i',
# 'giza-word-prefix1',
# 'giza-word-prefix2',
# 'giza-word-prefix4',
'giza-word-suffix',
'giza-word-suffix-i'
);
my @dynclues=(
# 'dp',
## 'dpp',
# 'dc',
# 'dx',
'dl', # lex
'dlp', # lex+pos
'dpx', # pos+position
'dp3', # pos-trigram
'dp3x', # pos-trigram+position
# 'dc3', # chunk-trigram
# 'dc3p', # chunk-trigram+pos
'dc3x' # chunk-trigram+position
);
my $dynpar=join (' 1 -',@dynclues);
my $dynamic="$uplug systems/align/word/dynamicclues -$dynpar 1";
my $DynNr;
my %StatAlignClues=(
'dice' => 'dice.dbm',
'sim' => 'str.dbm',
'gw' => 'giza-word.dbm',
'gwi' => 'giza-word-i.dbm',
'gp' => 'giza-pos.dbm',
'gpi' => 'giza-pos-i.dbm',
'gpw' => 'giza-pos-word.dbm',
'gpwi' => 'giza-pos-word-i.dbm',
'gwp' => 'giza-word-prefix.dbm',
'gwpi' => 'giza-word-prefix-i.dbm',
# 'gwp1' => 'giza-word-prefix1.dbm',
# 'gwp2' => 'giza-word-prefix2.dbm',
# 'gwp4' => 'giza-word-prefix4.dbm',
'gws' => 'giza-word-suffix.dbm',
'gwsi' => 'giza-word-suffix-i.dbm'
);
my %DynAlignClues=(
'dp' => 'pos.dbm',
# 'dpp' => 'pos_coarse.dbm',
'dc' => 'chunk.dbm',
'dx' => 'position.dbm',
'dl' => 'lex.dbm',
'dlp' => 'lexpos.dbm',
'dpx' => 'posposi.dbm',
'dp3' => 'postri.dbm',
'dp3x' => 'postriposi.dbm',
'dc3' => 'chunktri.dbm',
'dc3p' => 'chunktripos.dbm',
'dc3x' => 'chunktriposi.dbm'
);
#########################################################################
########################################################################
my $NrRunning=0; # process counter
my $NrClueAlign=0; # total number of alignment attempts
my @Setting; # the set of settings
my %Running; # running processes (indexed by filnames)
my %Tried; # settings tried already (filenames)
my %Scores; # scores
$Scores{F}={};
my $GoldSize=&GetGoldSize($TrainGold); # check the size of the gold standard
#########################################################################
#########################################################################
&MakeEstimatedClues(); # estimate clues
my @AvailableClues=&GetAvailableClues(); # get available clues
&CheckPrevious(\@Setting); # check previous aligment runs
if ($StartWithAllClues){
&RunAllClues(\@Setting,\@AvailableClues); # run with all clues
for (2..$RunMax){ # make $RunMax copies
push (@Setting,{}); # of the base setting
%{$Setting[-1]{clues}}=%{$Setting[-2]{clues}}; # (to be modified)
}
}
else{
if (not @Setting){ # if Settings is empty:
&RunSingleClues(\@Setting,\@AvailableClues); # run single clue settings
}
}
#----------------------------------------------------------------------
#########################################################################
#########################################################################
### this is the main loop ###############################################
#########################################################################
#########################################################################
my $bestScore=0; # best F-score so far
my $lastDyn; # last link-setting used for learning clues
my $countNotImpr=0; # count number loops without any improvement
my $modified=0; # number of modified settings
my $NrScores=keys %{$Scores{F}}; # nr results
my $nextDyn=(int($NrScores/$LearnStep)+1)*$LearnStep; # next time for learning clues
my $nextEval=(int($NrScores/$EvalStep)+1)*$EvalStep; # next time for evaluation
# my $nextDyn=keys %{$Scores{F}}; # next time to learn clues
# $nextDyn+=int()$LearnStep; # from the best alignment
# my $nextEval=keys %{$Scores{F}}; # next time for doing an evaluation
# $nextEval+=$EvalStep; # with the second gold-standard
do {
&GetBestSettings(\@Setting,$KeepMax); # get the $KeepMax best results
$modified=0;
foreach (0..$#Setting){ # try to modify existing settings
if ($StartWithAllClues){
if ($ModifyWeightsOnly){
if (&ModifyClueWeights($Setting[$_])){$modified++;}
}
else{
if (&ModifySetting2($Setting[$_])){$modified++;}
}
}
else{
if ($ModifyWeightsOnly){
if (&ModifyClueWeights($Setting[$_])){$modified++;}
}
else{
# if (&ModifySetting($Setting[$_])){$modified++;}
if (&Mutate($Setting[$_])){$modified++;}
}
}
}
if ($modified){ # if there are any new settings
&RunAllSettings(\@Setting); # run them!
}
#--------------------------------------------------
# learn dynamic clues from time to time
# (if more than $nextDyn scores stored in the result-table)
# 1) more than $LearnStep scores
# 2) more than 2*$LearnStep scores
# 3) more than 3*$LearnStep scores
# ....
my $NrScores=keys %{$Scores{F}};
if ($NrScores>$nextDyn){ # learn new dynamic clues
$nextDyn+=$LearnStep;
print "## top-list after $NrScores alignment runs (next=$nextDyn):\n";
&PrintScores();
my ($best)=sort {$Scores{F}{$b} <=> $Scores{F}{$a}} keys %{$Scores{F}};
my $score=$Scores{F}{$best}; # the best score so far
##########################################################################
#### old !!!!!!!!!!!!!!!!!!!!!!!!!!!
##########################################################################
# #-------------------------
# # if the score has been improved since last check
#
# if ($score<=$bestScore){ # not improved
# $countNotImpr++; # increment counter
# if ($countNotImpr>$maxNotImpr){ # that's enough
# &FinishUp(); # finish up now!
# }
# }
# else{
# $bestScore=$score; # improved? yes!->save it!
# if ($best and
# (not -e "$best.dyn") and # don't do it twice
# (-e "$best.links.gz")){ # links exist
##########################################################################
#### old up to here !!!!!!!!!!!!!!!!!!!!!!!!!!!
##########################################################################
#-----------------------------------------------------------------
# if the score has not been improved since last check
#-----------------------------------------------------------------
if (($score<=$bestScore) or $AlwaysDyn){ # if not improved (or always-learn-flag)
#---------------------------
# no improvement since
# last time we learned clues
#---------------------------
if ($Scores{F}{$lastDyn}>=$score){
$countNotImpr++; # increment counter
if ($countNotImpr>$maxNotImpr){ # that's enough
&FinishUp(); # finish up now!
}
}
#--------------------------------------
# improvement since last clue learning:
# learn clues from the best one!!!
#--------------------------------------
elsif ($best and (not -e "$best.dyn")){ # not yet done?
$DynNr++; # increment counter
$lastDyn=$best; # save link setting
&MakeDynamicClues($best);
#-------------------------------------
if ($StartWithAllClues){ # if StartWithAll:
&GetBestSettings(\@Setting,$KeepMax); # get top results
foreach my $s (@Setting){ # add all clues
&AllClueSetting($s); # to each setting
} # and ...
&RunAllSettings(\@Setting); # ... run them!
# push (@Setting,{clues=>{}}); # create a setting
# &AllClueSetting($Setting[-1]); # with all clues
}
}
}
#-----------------------------------------------------------------
# if the score has been improved since last check: save the score!
# (don't do anything else)
#-----------------------------------------------------------------
else{
$bestScore=$score;
}
}
# -----------------
sleep 1;
}
until ((not $modified) and (not keys %Running));
#-------------------------------------------------------------
&FinishUp(); # wait until all running processes are finished
#-------------------------------------------------------------
exit;
#########################################################################
#########################################################################
### this is the end #####################################################
#########################################################################
#########################################################################
use File::stat;
sub DynIsRunning{
my @dynamic=keys %Running;
@dynamic=grep (/^dyn/,@dynamic);
foreach (@dynamic){
if (-e "$_.links"){
my $fstat=stat("$_.links"); # check file statistics
my $mtime=time-$fstat->mtime; # time - last-modification-time
if ($mtime>600){ # no modification since 10 min?
unlink ("$_.links"); # --> remove the file!
delete $Running{$_}; # --> remove from Running
} # otherwise:
else{return 1;} # yes, still running!
}
}
return 0;
}
#-------------------------
# check previous runs
# read their scores
# save their settings
#-------------------------
sub CheckPrevious{
my $settings=shift;
opendir(DIR, '.');
my @eval = grep { /\.eval/ } readdir(DIR);
@eval = grep { $_!~/giza/ } @eval;
closedir DIR;
foreach my $file (@eval){
$file=~s/\.eval//;
if ($file=~/dyn([0-9]+)\_/){ # if there are already some dynamic
if ($1>$DynNr){$DynNr=$1;} # clues: check the DynNr
}
push(@{$settings},{});
&File2Setting($settings->[-1],$file);
$Tried{$file}=1;
system "touch $file.ready";
&CheckScore($settings,$file);
unlink "$file.ready";
}
}
#-------------------------
sub FinishUp{
my $running=scalar keys %Running;
while ($running){
$running=0;
foreach my $r (keys %Running){
if (&CheckScore(\@Setting,$r)){
$NrRunning--;
delete $Running{$r};
}
elsif (-e "$r.links"){$running++;}
}
sleep 1;
}
foreach my $r (keys %Running){
print "warn: failed to run $r\n!";
}
my $NrScores=keys %{$Scores{F}};
print "## final top-list after $NrScores alignment runs:\n";
&PrintScores();
}
#########################################################################
###### this is the end ##################################################
#########################################################################
#########################################################################
sub GetBestSettings{
my $set=shift;
my $max=shift;
##-----------------------------------------------------------
## wait for all currently running processes to be finished
## (comment this out if fast processes should be preferred!)
#
# while (keys %Running){
foreach my $r (keys %Running){
if (&CheckScore($set,$r)){
$NrRunning--;
delete $Running{$r};
}
}
# sleep 1;
# }
##-----------------------------------------------------------
my @best=sort {$Scores{F}{$b} <=> $Scores{F}{$a}} keys %{$Scores{F}};
if ($#best<$max){return;}
@{$set}=();
my $count=0;
while (@best){
my $file=shift(@best);
$count++; # count the number of settings
if ($count>$max){ # keep only the $max best settings!
foreach (@best){ # remove all other link files:
if (-e "$_.links.gz"){unlink "$_.links.gz";}
if (-e "$_.links"){unlink "$_.links";}
}
return;
}
push(@{$set},{});
&File2Setting($set->[-1],$file);
}
}
###########################################################
sub RunSingleClues{
my ($sets,$clues)=@_;
while (@{$clues}){ # create inital settings
my $clue=shift(@{$clues}); # foreach available clue
push (@{$sets},{clues=>{$clue=>$DefWeight}, # create settings with
lastchange=>'+'.$clue, # single clues
modified=>1});
}
&RunAllSettings($sets); # run all clue alignment settings
}
sub RunAllClues{
my ($sets,$clues)=@_;
push (@{$sets},{clues=>{}});
&AllClueSetting($sets->[-1]);
&RunAllSettings($sets); # run all clue alignment settings
}
###########################################################
# RunAllSettings
#
# check evaluation files first
# then run all settings in the settings-array
sub RunAllSettings{
my $settings=shift;
print "Running: ";
print scalar keys %Running;
print ", Scores: ";
print scalar keys %{$Scores{F}};
print ", NrClues: ";
print scalar @AvailableClues;
print ", NrClueAlign: $NrClueAlign, NrRunning: $NrRunning";
#-----------------------------------------------
do {
if (keys %Running){
foreach my $r (keys %Running){
if (&CheckScore($settings,$r)){
$NrRunning--;
delete $Running{$r};
}
}
}
sleep 1;
}
until ($NrRunning<$RunMax);
#-----------------------------------------------
# print "running: ";
# print scalar keys %Running;
print " ($NrRunning)\n";
# print "";
foreach my $s (@{$settings}){
if ($s->{modified}){
my $file=&Setting2File($s);
system "touch $file.ready";
if (&CheckScore($settings,$file)){next;}
unlink "$file.ready";
my $file=&RunClueAligner($s);
# $NrRunning++;
# $Running{$file}=$s->{lastchange};
}
}
$NrRunning=scalar keys %Running;
}
###########################################################
# take all available clues!!!!
sub AllClueSetting{
my $s=shift;
my @AllClues=&GetAvailableClues(); # get all available clues
@AvailableClues=@AllClues;
foreach (@AllClues){ # for all clue types
if (not defined $s->{clues}->{$_}){ # if not included already:
$s->{clues}->{$_}=$DefWeight; # set default weight
}
}
my $file=&Setting2File($s); # get the filebase
if ((defined $Scores{F}{$file}) or # if tested already or
(defined $Tried{$file})){ # if already running:
delete $s->{clues}->{$_}; # don't try again
}
else{ # otherwise:
$s->{modified}=1; # give it a shot
return 1;
}
return 0;
}
###########################################################
# modify clue alignment settings
# change clue weights ONLY!!!!!
sub ModifyClueWeights{
my $s=shift;
#----------------
# check without modification
# (might be ok if new dynamic clues exist!
#----------------
my $file=&Setting2File($s); # get the filebase
if ((not defined $Scores{F}{$file}) # if tested already or
and (not defined $Tried{$file})){ # if already running:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
my $step=$ClueWeightSteps;
my @clues=keys %{$s->{clues}};
my $file;
do {
my $idx=int(rand $#clues+1); # random index number
my $clue=$clues[$idx]; # (choose random clue)
my $sign=-1; # decrease
if (int(rand 2)){$sign=1;} # or increase
$s->{clues}->{$clue}+=$sign*$step; # change the weight
$s->{clues}->{$clue}=
(int(100*$s->{clues}->{$clue}))/100;
$file=&Setting2File($s); # convert to file base
}
until ((not defined $Scores{F}{$file}) # do this until a new setting
and (not defined $Tried{$file})); # is found!
$s->{modified}=1; # try it (mark as modified
return 1; # and return)
}
# end of modify setting
###########################################################
###########################################################
# modify clue alignment settings (version 2)
#
# 1) delete available clues
# 2) change clue weights
sub ModifySetting2{
my $s=shift;
#----------------
# check without modification
# (might be ok if new dynamic clues exist!
#----------------
my $file=&Setting2File($s); # get the filebase
if ((not defined $Scores{F}{$file}) # if tested already or
and (not defined $Tried{$file})){ # if already running:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
my @AllClues=&GetAvailableClues(); # get all available clues
@AvailableClues=@AllClues;
## foreach my $c (@AllClues){ # for all clue types
while (@AllClues){ # as long as there are clues
my $idx=int(rand $#AllClues+1); # random index number
my $c=splice(@AllClues,$idx,1); # get/remove the clue
if (defined $s->{clues}->{$c}){ # if included already:
my $weight=$s->{clues}->{$c}; # save the weight
delete $s->{clues}->{$c}; # delete the clue
my $file=&Setting2File($s); # get the filebase
if ((defined $Scores{F}{$file}) or # if tested already or
(defined $Tried{$file})){ # if already running:
$s->{clues}->{$c}=$weight; # put it back again
}
else{ # otherwise:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
}
}
#-----------------------------
# deleting clues didn't work
# --> try to change weights!
#-----------------------------
my $step=$ClueWeightSteps;
my @clues=keys %{$s->{clues}};
my $file;
do {
my $idx=int(rand $#clues+1); # random index number
my $clue=$clues[$idx]; # (choose random clue)
my $sign=-1; # decrease
if (int(rand 2)){$sign=1;} # or increase
$s->{clues}->{$clue}+=$sign*$step; # change the weight
$s->{clues}->{$clue}=
(int(100*$s->{clues}->{$clue}))/100;
$file=&Setting2File($s); # convert to file base
}
until ((not defined $Scores{F}{$file}) # do this until a new setting
and (not defined $Tried{$file})); # is found!
$s->{modified}=1; # try it (mark as modified
return 1; # and return)
}
# end of modify setting
###########################################################
###########################################################
# modify clue alignment settings
#
# 1) add available clues
# 2) change clue weights
sub ModifySetting{
my $s=shift;
#----------------
# check without modification
# (might be ok if new dynamic clues exist!
#----------------
my $file=&Setting2File($s); # get the filebase
if ((not defined $Scores{F}{$file}) # if tested already or
and (not defined $Tried{$file})){ # if already running:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
my @AllClues=&GetAvailableClues(); # get all available clues
@AvailableClues=@AllClues;
## foreach my $c (@AllClues){ # for all clue types
while (@AllClues){ # as long as there are clues
my $idx=int(rand $#AllClues+1); # random index number
my $c=splice(@AllClues,$idx,1); # get/remove the clue
if (not defined $s->{clues}->{$c}){ # if not included already:
$s->{clues}->{$c}=$DefWeight; # set default weight
my $file=&Setting2File($s); # get the filebase
if ((defined $Scores{F}{$file}) or # if tested already or
(defined $Tried{$file})){ # if already running:
delete $s->{clues}->{$c}; # don't try again
}
else{ # otherwise:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
}
}
my $step=$ClueWeightSteps;
my @clues=keys %{$s->{clues}};
my $file;
do {
my $idx=int(rand $#clues+1); # random index number
my $clue=$clues[$idx]; # (choose random clue)
my $sign=-1; # decrease
if (int(rand 2)){$sign=1;} # or increase
$s->{clues}->{$clue}+=$sign*$step; # change the weight
$s->{clues}->{$clue}=
(int(100*$s->{clues}->{$clue}))/100;
$file=&Setting2File($s); # convert to file base
}
until ((not defined $Scores{F}{$file}) # do this until a new setting
and (not defined $Tried{$file})); # is found!
$s->{modified}=1; # try it (mark as modified
return 1; # and return)
}
# end of modify setting
###########################################################
###########################################################
# modify clue alignment settings
#
# 1) add available clues
# 2) change clue weights
sub Mutate{
my $s=shift;
#----------------
# check without modification
# (might be ok if new dynamic clues exist!
#----------------
my $file=&Setting2File($s); # get the filebase
if ((not defined $Scores{F}{$file}) # if tested already or
and (not defined $Tried{$file})){ # if already running:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
my @AllClues=&GetAvailableClues(); # get all available clues
@AvailableClues=@AllClues;
# 0 --> add clue
# 1 --> increase weight
# 2 --> decrease weight
# 3 --> increase minimal score
# 4 --> decrease minimal score
# 5 --> remove clue
while (1){
my $operation=int(rand 60);
if ($operation<30){
if (&AddRandomClue($s,@AllClues)){
return 1;
}
}
elsif ($operation<40){
if (&ChangeRandomClueWeight($s,$ClueWeightSteps)){
return 1;
}
}
elsif ($operation<50){
if (&ChangeRandomClueWeight($s,0-$ClueWeightSteps)){
return 1;
}
}
elsif ($operation<54){
if (&ChangeScoreThreshold($s,$ScoreThrSteps)){
return 1;
}
}
elsif ($operation<58){
if (&ChangeScoreThreshold($s,0-$ScoreThrSteps)){
return 1;
}
}
elsif ($operation<60){
if (&RemoveRandomClue($s)){
return 1;
}
}
}
}
sub AddRandomClue{
my $s=shift;
my @AllClues=@_;
while (@AllClues){
my $idx=int(rand $#AllClues+1); # random index number
my $c=splice(@AllClues,$idx,1); # get/remove the clue
if (not defined $s->{clues}->{$c}){ # if not included already:
$s->{clues}->{$c}=$DefWeight; # set default weight
my $file=&Setting2File($s); # get the filebase
if ((defined $Scores{F}{$file}) or # if tested already or
(defined $Tried{$file})){ # if already running:
delete $s->{clues}->{$c}; # don't try again
}
else{ # otherwise:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
}
}
return 0;
}
sub RemoveRandomClue{
my $s=shift;
my @AllClues=keys %{$s->{clues}};
my $file;
while ($#AllClues){ # as long as there are >2 clues:
my $idx=int(rand $#AllClues+1); # choose a clue
my $c=splice(@AllClues,$idx,1); # get the clue name
my $old=$s->{clues}->{$c}; # save the old weight
delete $s->{clues}->{$c}; # delete the clue
my $file=&Setting2File($s); # get the filebase
if ((defined $Scores{F}{$file}) or # if tested already or
(defined $Tried{$file})){ # if already running:
$s->{clues}->{$c}=$old; # restore clue
}
else{ # otherwise:
$s->{modified}=1; # give it a shot
return 1; # (mark as modified)
}
}
return 0;
}
sub ChangeRandomClueWeight{
my $s=shift;
my $step=shift;
my @clues=keys %{$s->{clues}};
my $file;
# do {
my $idx=int(rand $#clues+1); # random index number
my $clue=$clues[$idx]; # (choose random clue)
my $old=$s->{clues}->{$clue}; # save the old weight
$s->{clues}->{$clue}+=$step; # change the weight
$s->{clues}->{$clue}=
(int(100*$s->{clues}->{$clue}))/100;
$file=&Setting2File($s); # convert to file base
# }
# until ((not defined $Scores{F}{$file}) #
# and (not defined $Tried{$file}));
# $s->{modified}=1;
# return 1;
if ((not defined $Scores{F}{$file}) #
and (not defined $Tried{$file})){ #
$s->{modified}=1;
return 1;
}
$s->{clues}->{$clue}=$old; # restore the old weight
return 0;
}
sub ChangeScoreThreshold{
my $s=shift;
my $step=shift;
my $file;
# do{
if (not defined $s->{score}){
$s->{score}=$DefScoreThr;
}
else{
$s->{score}+=$step;
}
if ($s->{score}<=0){
delete $s->{score};
}
$file=&Setting2File($s); # convert to file base
# }
# until ((not defined $Scores{F}{$file}) #
# and (not defined $Tried{$file}));
# $s->{modified}=1;
# return 1;
if ((not defined $Scores{F}{$file}) #
and (not defined $Tried{$file})){ #
$s->{modified}=1;
return 1;
}
return 0;
}
# end of modify setting
###########################################################
###########################################################
# check scores:
#
# check if there's an eval-file with some scores
# save scores in the Score-hash
# restart the clue aligner if something went wrong
sub CheckScore{
my $setting=shift;
my $file=shift;
if (-e "$file.ready" && -e "$file.eval"){
# if (-e "$file.eval"){
open (E, "$file.eval");
my @link = <E>;
close E;
#-------------------------------------------------------
# check the size of the evaluation file
# and compare it to the size of the gold standard
# if they are not identical --> something went wrong
# --> re-run the alignment!
my ($size)=grep /size of gold standard/,@link;
$size=~s/^.*size of gold standard:\s*([0-9]+)\,.*$/$1/;
if (($size<$GoldSize) or ($size>$GoldSize+5)){
if (-e "$file.links.gz"){unlink "$file.links.gz";}
if (-e "$file.links"){unlink "$file.links";}
unlink "$file.eval"; # delete old eval-file
unlink "$file.ready"; # delete ready-flag
delete $Running{$file}; # delete from running
delete $Tried{$file};
delete $Scores{F}{$file}; # delete scores
# sleep 1; # JT 16-11-2004:
# push (@{$setting},{}); # do not restart alignments
# &File2Setting($setting->[-1],$file); # in case of corrupted
# $setting->[-1]->{modified}=1; # evaluation files (it may
# &RunClueAligner($setting->[-1]); # cause conflicts!!!!)
return 0;
}
#-------------------------------------------------------
if ($link[-1]=~/F:\s+([0-9]+\.[0-9]+)\%/){ # save the F-score
$Scores{F}{$file}=$1;
if ($link[-2]=~/precision:\s+([0-9]+\.[0-9]+)\%/){ # precision
$Scores{P}{$file}=$1;
}
if ($link[-3]=~/recall:\s+([0-9]+\.[0-9]+)\%/){ # precision
$Scores{R}{$file}=$1;
}
if (-e "$file.log"){
open (L, "$file.log");
my ($time) = grep {/processing time:/} <L>;
close L;
if ($time=~/processing time:\s*([0-9].*)$/){
$time=$1;
while ($time=~s/^\s*0:\s*//){};
$Scores{time}{$file}=$time;
}
unlink "$file.log";
}
unlink "$file.ready";
return 1;
}
}
return 0;
}
# end of CheckScores
###########################################################
###########################################################
# PrintScore: print the current score table
sub PrintScores{
if (not keys %Scores){return;}
print "----------------------------------------------------\n";
printf "%5s %5s %5s %8s %s\n",'P','R','F','time','name';
print "----------------------------------------------------\n";
foreach (sort {$Scores{F}{$b} <=> $Scores{F}{$a}} keys %{$Scores{F}}){
printf "%3.2f %3.2f %3.2f %8s %s\n",
$Scores{P}{$_},$Scores{R}{$_},$Scores{F}{$_},$Scores{time}{$_},$_;
}
print "----------------------------------------------------\n";
}
###########################################################
# GetGoldSize: get the size of the gold standard
sub GetGoldSize{
my $file=shift;
open (G, $file);
my @links = grep {/\<(wordLink|link)\s/} <G>;
close G;
my %uniq=(); # there are some copies in some
map ($uniq{$_}=1,@links); # gold standard files --> count only
return scalar keys %uniq; # unique links!
# return scalar @links;
}
###########################################################
# GetAvailableClues: get all available clues
# (check if dbm-files exist)
sub GetAvailableClues{
my @avail=();
foreach (keys %StatAlignClues){
if (-e "data/runtime/$StatAlignClues{$_}"){
push (@avail,$_);
}
}
foreach (keys %DynAlignClues){
if (-e "data/runtime/$DynAlignClues{$_}"){
push (@avail,$_);
}
}
if (defined $srclang && defined $trglang){
if (open (C, "$UplugHome/$LinkConfig")){
my @config=<C>;
close C;
if (grep {/\'${srclang}${trglang}p\'/} @config){
push (@avail,"${srclang}${trglang}p");
}
if (grep {/\'${srclang}${trglang}pp\'/} @config){
push (@avail,"${srclang}${trglang}pp");
}
if (grep {/\'${srclang}${trglang}c\'/} @config){
push (@avail,"${srclang}${trglang}c");
}
}
}
return @avail;
}
########################################################################
########################################################################
########################################################################
sub MakeLinkSubDir{
use File::Basename;
my $links=shift;
my $dir=$links;
# $dir=~s/\.links.gz//;
# $dir=uc($dir);
mkdir $dir,0755;
my ($name,$path)=fileparse($links);
link $links,"$dir/$name";
return $dir;
}
#-------------------
# MakeClueLinks: link non-existing clues to data/runtime
# in a given sub-directory
sub MakeClueLinks{
my $dir=shift;
opendir(DIR, 'data/runtime'); # make links to clues
my @dbm = grep { /\.dbm/ } readdir(DIR);
closedir DIR;
mkdir "$dir/data",0755;
mkdir "$dir/data/runtime",0755;
foreach (@dbm){
if (not -e "$dir/data/runtime/$_"){
link "data/runtime/$_","$dir/data/runtime/$_";
}
}
}
###########################################################
# RunClueAligner
#
#-------------------------------------------------------
# run clue aligner
# settings: a set of clues with weights
#-------------------------------------------------------
sub RunClueAligner{
my $setting=shift;
if (ref($setting->{clues}) ne 'HASH'){return;}
my @clues=keys %{$setting->{clues}};
if (not @clues){return;}
my @weights=values %{$setting->{clues}};
my $minscore=$setting->{score};
my $param='';
foreach (keys %{$setting->{clues}}){
# $param.='-'.$_.' ';
if ($setting->{clues}->{$_}){
$param.='-'.$_.' ';
$param.='-'.$_.'_w '.$setting->{clues}->{$_}.' ';
}
}
if ($minscore){$param.=" -min $minscore";}
my $file=&Setting2File($setting);
my $comm="nice $align -in $TrainBitext $param -out $file.links 2>$file.log";
$comm.=";nice gzip -f $file.links";
# $comm.=";nice gzip -f $file.log";
$comm.=";nice $eval -gold $TrainGold -in $file.links.gz >$file.eval";
if (-e $HeldoutBitext){
if ($HeldoutBitext and (not -e 'HELDOUT')){mkdir 'HELDOUT',0755;}
$comm.=";nice $align -in $HeldoutBitext $param -out HELDOUT/$file.links 2>HELDOUT/$file.log";
$comm.=";nice gzip -f HELDOUT/$file.links";
$comm.=";nice gzip -f HELDOUT/$file.log";
if (-e $HeldoutGold){
$comm.=";nice $eval -gold $HeldoutGold -in HELDOUT/$file.links.gz >HELDOUT/$file.eval";
$comm.=";nice rm HELDOUT/$file.links.gz";
}
}
if (-e $EvalBitext){
if ($EvalBitext and (not -e 'EVAL')){mkdir 'EVAL',0755;}
$comm.=";nice $align -in $EvalBitext $param -out EVAL/$file.links 2>EVAL/$file.log";
$comm.=";nice gzip -f EVAL/$file.links";
$comm.=";nice gzip -f EVAL/$file.log";
if (-e $EvalGold){
$comm.=";nice $eval -gold $EvalGold -in EVAL/$file.links.gz >EVAL/$file.eval";
$comm.=";nice rm EVAL/$file.links.gz";
}
}
$comm.=";touch $file.ready";
$setting->{modified}=0;
if ((-e "$file.links") or
(-e "$file.eval") or
(-e "$file.links.gz")){
print "warning: clue alignment '$file' already in progress!!!!\n";
return undef;
}
if (defined $Running{$file}){
# print "warning: clue alignment '$file' already running!!!!\n";
$Running{$file}=1;
return undef;
}
if (defined $Tried{$file}){
print "warning: clue alignment '$file' already tried earlier!!!!\n";
$Tried{$file}=1;
return undef;
}
if (defined $Scores{F}{$file}){
print "warning: there is a score for clue alignment '$file'!!!!\n";
$Tried{$file}=1;
return undef;
}
$Running{$file}=1;
$Tried{$file}=1;
$NrClueAlign++;
# print "run $file\n";
system ("$remote '$comm'");
return $file;
}
# end of RunClueAligner
###########################################################
###########################################################
# make a unique filename for a clue aligner setting
sub Setting2File{
my $setting=shift;
if (ref($setting->{clues}) ne 'HASH'){return undef;}
my $file='C'; # start file names with 'C' (to avoid initial '-')
my $InclDynClue=0;
foreach (sort keys %{$setting->{clues}}){
if (defined $setting->{clues}->{$_}){ # weights before clues!
$file.=$setting->{clues}->{$_};
}
$file.=$_; # clue name
if (defined $DynAlignClues{$_}){$InclDynClue=1;}
$file.='+';
}
chop ($file);
if (not $DynNr){$DynNr=1;}
if ($InclDynClue){$file='dyn'.$DynNr.'__'.$file;} # add dynclue-marker
my $minscore=$setting->{score};
if ($minscore){$file.="_min$minscore";}
return $file;
}
###########################################################
# create the clue aligner setting from a uniqe filename
sub File2Setting{
my $setting=shift;
my $file=shift;
$file=~s/^.*?\_\_//; # remove dynclue-marker
$file=~s/^C//; # remove initial 'C'
if ($file=~s/(\_min)(.*)$//){ # check if there's a score threshold
$setting->{score}=$2;
}
my @clues=split(/\+/,$file); # split into clues
foreach (@clues){
if (/^([0-9\.]*)([^0-9].*)$/){
$setting->{clues}->{$2}=$1;
}
else{
$setting->{clues}->{$_}=$DefWeight;
}
}
}
###########################################################
# MakeEstimatedClues
#
#------------------------------------------------
# create giza clues and basic clues
# and wait until they are finished
#------------------------------------------------
sub MakeEstimatedClues{
#------------------------------------------------
# run giza remotely
print "create estimated clues!\n";
foreach my $c (@gizaclues){
if (-e "$c.eval"){
next;
}
my $comm="$uplug systems/align/word/test/$c -in $Bitext -out $c.links";
$comm.=";gzip -f $c.links;$eval -gold $TrainGold -in $c.links.gz >$c.eval";
system ("$remote '$comm'");
}
#------------------------------------------------
# create basic clues (dice+sim)
if ((not -e "data/runtime/str.dbm") or
(not -e "data/runtime/dice.dbm")){
print "create basic clues!\n";
system("$uplug systems/align/word/basicclues -in $Bitext");
}
#------------------------------------------------
# wait for the remote alignments (giza)
foreach my $c (@gizaclues){
print "waiting for $c.eval!\n";
while (not -e "$c.eval"){sleep 1;}
}
}
###########################################################
# MakeDynamicClues
#
#------------------------------------------------
# learn dynamic clues from some previous links
#------------------------------------------------
sub MakeDynamicClues{
my $file=shift;
my $thisdir=getcwd; # the current dir
my $links="$thisdir/__dynamic.links"; # training data
print "align $Bitext with $file!\n"; # (this should be done in a common
my %setup=(); # sub-routine like in
&File2Setting(\%setup,$file); # RunClueAligner)
my $param='';
if (ref($setup{clues}) ne 'HASH'){return;}
foreach (keys %{$setup{clues}}){
if ($setup{clues}{$_}){
$param.='-'.$_.' ';
$param.='-'.$_.'_w '.$setup{clues}{$_}.' ';
}
}
if ($setup{score}){$param.=" -min $setup{score}";}
my $comm="nice $align -in $Bitext $param -out $links 2>__dynamic.log";
$comm.=";touch __dynamic.ready";
system ("$remote '$comm'");
print "wait for __dynamic.ready!\n";
while (not -e '__dynamic.ready'){sleep 1;}
print "learn dynamic clues from __dynamic.links\n";
if (not -e 'dynamic'){mkdir 'dynamic',0755;} # training dir
chdir 'dynamic'; # (go there)
my $comm="$dynamic -in $links >dyn$DynNr.out 2>dyn$DynNr.log";
$comm.=";touch dyn$DynNr.ready;";
system ("$remote '$comm'");
print "wait for the new dynamic clues to be finished\n";
while (not -e "dyn$DynNr.ready"){sleep 1;} # wait until ready
unlink "dyn$DynNr.ready";
chdir $thisdir; # (go back)
#-------------------------------------
# wait until all alignment processes
# using dynamic clues are finished
# (this should be probably checked more carefully)
#-------------------------------------
print "wait for processes using old dynamic clues!\n";
while (&DynIsRunning){sleep 1;}
#-------------------------------------
# move dynamic clues
# (overwrite old ones!!!)
#-------------------------------------
system ("mv dynamic/data/runtime/*dbm* data/runtime/");
system ("touch $file.dyn"); # create a flag
#-------------------------------------
}