#!/usr/bin/env perl
#-*-perl-*-
#
# OPTIONS
#
#
# INPUT
#----------------------------------------------------------------------------
#  -a align-file ....... aligned treebank
#  -A format ........... treebank alignment format (default = STA)
#
#  -s source-treebank .. source language treebank file (taken from align-file)
#  -S src-format ....... format of the source language treebank (default: tiger)
#  -t target-treebank .. target language treebank file (taken from align-file)
#  -T trg-format ....... format of the target language treebank (default: tiger)
#  -w .................. swap alignment direction
#
#
# TRAINING
#----------------------------------------------------------------------------
#  -f features ......... features to be used in the model
#  -n nrSent ........... nr of sentences to be used for training (default: all)
#  -c classifier ....... classifier to be used (default = megam)
#  -m modelfile ........ file in which the model will be stored (treealign.$opt_c)
#  -M moses-dir ........ directory with Moses alignments & models
#  -L -N ............... align nodes of the same type only
#  -N .................. align non-terminals only
#  -L .................. align terminal nodes only (leafs)
#  -k .................. keep feature file extracted for training
#
#  -d lexe2f ........... Moses lex.e2f file ($opt_M/model/lex.0-0.e2f)
#  -D lexf2e ........... Moses lex.f2e file ($opt_M/model/lex.0-0.f2e)
#  -g giza.e2f ......... GIZA++ alignments ($opt_M/giza.src-trg/src-trg.A3.final.gz)
#  -G giza.f2e ......... GIZA++ alignments ($opt_M/giza.trg-src/trg-src.A3.final.gz)
#  -y SymAlign ......... Moses symmetrized alignment ($opt_M/model/aligned.intersect)
#  -I id-file .......... file with sentence IDs for reading Moses/Giza align
#
#  -C .................. enable linked-children feature
#  -U .................. enable linked-subtree-nodes feature
#  -P .................. enable linked-parent feature
#  -J .................. enable link distance feature (parent <-> current)
#  -R nr_iter .......... SEARN adaptive learning (repeat nr_iter)
#
#
# ALIGNMENT
#---------------------------------------------------------------------------
#  -e nrSent ........... nr of sentences to be aligned
#  -l linksearch ....... link search strategy (default = greedy)
#  -x threshold ........ alignment score threshold
#
#
#  -v .................. verbose output


# use Devel::Leak::Object;
# use Devel::Leak::Object qw(GLOBAL_bless);


use strict;
use FindBin;
use lib $FindBin::Bin.'/../lib';

# Package globals populated by Getopt::Std::getopts() below: one
# $opt_<letter> per command-line switch.  They have to be package variables
# (not "my" lexicals) because getopts() assigns to them by name.  "our" is
# the declaration recommended by Getopt::Std under "use strict"; it
# supersedes the obsolete "use vars" pragma.
our ($opt_a, $opt_A, $opt_s, $opt_S, $opt_t, $opt_T,
     $opt_w, $opt_f, $opt_n, $opt_e, $opt_x, $opt_l, $opt_c, $opt_m,
     $opt_M, $opt_N, $opt_L, $opt_k, $opt_R,
     $opt_d, $opt_D, $opt_g, $opt_G, $opt_y,
     $opt_C, $opt_U, $opt_P, $opt_J, $opt_I, $opt_v,
     $opt_1, $opt_2, $opt_3, $opt_4, $opt_u, $opt_K, $opt_i,
     $opt_X, $opt_o, $opt_O,
     $opt_b, $opt_r, $opt_B, $opt_V,
     $opt_5, $opt_6, $opt_7, $opt_8, $opt_9);


use Getopt::Std;

# Switch specification: a letter followed by ':' expects a value, a letter on
# its own is a boolean flag.  Flags are: w N L k C P U J v u K i B 6 7 8 9;
# every other switch listed here takes an argument.
# NOTE(review): the return value of getopts() is not checked, so the script
# carries on after an unknown switch has been reported.
getopts('a:A:s:S:t:T:wf:n:e:x:l:c:m:M:NLd:D:g:G:y:kCPUJI:vR:1:2:3:4:uKiX:b:r:BV:O:o:5:6789');


use Lingua::Align::Trees;

#--------------------------------------------------------
# Derive the effective settings from the command-line
# switches, falling back to defaults where a switch is
# missing.
#--------------------------------------------------------

my $featureStr = $opt_f || 'insideST2:insideTS2:outsideST2:outsideTS2';
my $nrTrain = $opt_n;                      # -n: nr of training sentences
my $nrDev = $opt_V;                        # -V (presumably nr of development
                                           #     sentences -- TODO confirm)
my $classifier = $opt_c || 'megam';
my $modelfile = $opt_m || 'treealign.'.$classifier;

my $nrEval = $opt_e;                       # -e: nr of sentences to be aligned
my $search = $opt_l || 'greedy';           # -l: link search strategy

my $MosesDir = $opt_M || 'moses';

# weights for different types of links
#
# An explicitly given weight is always honoured -- including 0, which the
# former "$opt_1 || '3'" / "$opt_3 || '1'" silently turned back into the
# default although -2 and -4 already accepted 0.  The default is used only
# if the switch is absent (or, for -1/-3 as before, given as empty string).
my $WeightS = '3';                         # sure (=good)
if (defined $opt_1 && $opt_1 ne ''){$WeightS=$opt_1;}
my $WeightP = 1;                           # possible (=fuzzy)
if (defined $opt_2){$WeightP=$opt_2;}
my $WeightN = '1';                         # wrong (=negative data)
if (defined $opt_3 && $opt_3 ne ''){$WeightN=$opt_3;}
my $WeightW = 1;                           # new category: weak links!
if (defined $opt_4){$WeightW=$opt_4;}

my $alignment = $opt_b || 'inference';     # alignment strategy 
                                           # (inference|bottom-up)
my $randomNegative = $opt_r || 0;
my $negativeNeighbors = $opt_B || 0;

#--------------------------------------------------------
# if -L && -N ... align nodes of the same type only
# else if -L .... align terminals (leafs) only
# else if -N .... align non-terminals only
# else .......... no restriction

my ($SameType,$NTonly,$Tonly)=(0,0,0);
if ($opt_L && $opt_N){
    $SameType = 1;
}
elsif ($opt_L){
    $Tonly=1;
}
elsif ($opt_N){
    $NTonly=1;
}

# -K --> add links in competitive mode
# (overrides whatever -u was set to)
if ($opt_K){$opt_u = 'compete';}

# features from Moses/Giza
# (each file can be given explicitly; otherwise it is looked up in $MosesDir)

my $lexe2f = $opt_d || $MosesDir.'/model/lex.0-0.e2f';
my $lexf2e = $opt_D || $MosesDir.'/model/lex.0-0.f2e';

my $gizaA3_e2f = $opt_g || $MosesDir.'/giza.src-trg/src-trg.A3.final.gz';
my $gizaA3_f2e = $opt_G || $MosesDir.'/giza.trg-src/trg-src.A3.final.gz';

my $moses_align = $opt_y || $MosesDir.'/model/aligned.intersect';

my $OutputFormat = $opt_O || 'sta';


# Create the tree aligner object with all settings collected above.
# The constructor is called with direct method syntax
# (Class->new) instead of the indirect-object form "new Class(...)", which
# Perl can mis-parse depending on what else is in scope.
my $treealigner = Lingua::Align::Trees->new(

    -alignment => $alignment,               # type of alignment strategy
    -features => $featureStr,              # features to be used

    -output_format => $OutputFormat,       # output format

    -classifier => $classifier,            # classifier used
    -classifier_weight_sure => $WeightS,   # training: weight for sure links
    -classifier_weight_possible=>$WeightP, # training: weight for possible links
    -classifier_weight_weak=>$WeightW,     # training: weight for weak links
    -classifier_weight_negative=>$WeightN, # training: weight for non-linked

    # some special settings for pruning training and test data
    # - don't use all possible node pairs to create negative data points
    # - classification threshold: for bottom-up alignment
    #
    -random_negative_examples => $randomNegative,  # use random neg. data points
    -negative_neighbors => $negativeNeighbors,     # use only neighbors
#    -score_threshold => 0.3,                      # classification threshold

    -megam           => $opt_X,             # (path to) megam
    -megam_arguments => $opt_o,             # additional megam arguments

#    -megam => '/Users/joerg/work/align/MaxEnt/megam_0.92/megam',
#    -megam => '/storage3/data/PACO-MT/tools/megam_i686.opt',

    -keep_training_data => $opt_k,        # don't remove feature file

    -same_types_only => $SameType,        # link only T&T and nonT&nonT
    -nonterminals_only => $NTonly,        # link non-terminals only
    -terminals_only => $Tonly,            # link terminals only
    -skip_unary => 0,                     # 1:skip nodes with unary productions

    -add_links => $opt_u,                 # "add links" mode

    # add first-order dependency:
    -linked_children => $opt_C,           # (proportion of linked children)
    -linked_subtree => $opt_U,            # (proportion of linked descendents)
    -linked_parent => $opt_P,             # linked parent feature
    -linked_parent_distance => $opt_J,    # distance of parent link

    -linked_neighbors => $opt_5,          # -1:-1+-1:0+0:-1 ...
    -linked_children_proper => $opt_6,
    -linked_subtree_proper => $opt_7,
    -linked_children_inout => $opt_8,
    -linked_subtree_inout => $opt_9,


    -searn => $opt_R,                     # adaptive SEARN training

    -lexe2f => $lexe2f,
    -lexf2e => $lexf2e,

    ## for the GIZA++ word alignment features
    -gizaA3_e2f => $gizaA3_e2f,
    -gizaA3_f2e => $gizaA3_f2e,

    # the same ID file (-I) is used for both GIZA++ directions
    -gizaA3_e2f_ids => $opt_I,
    -gizaA3_f2e_ids => $opt_I,

    ## for the Moses word alignment features
    -moses_align => $moses_align,
    -moses_align_ids => $opt_I,

    -lex_lower => 1,                      # always convert to lower case!
    -min_score => $opt_x,

    -weak_wellformedness => 1,        # use *weak* wellformedness if applicable

    -verbose => $opt_v,

    );


# Devel::Leak::Object::track($treealigner);


# Description of the parallel corpus; the same hash is handed to both
# train() and align() further down.

my %corpus;
if ($opt_a) {
    %corpus = (
        -alignfile => $opt_a,
        -type      => ($opt_A || 'sta'),
    );
    if ($opt_w) {
        $corpus{-swap_alignment} = $opt_w;
    }
}

# Source/target treebank files and formats given on the command line
# take precedence over the ones named in the alignment file
# --> for example: an OPUS sentence alignment file can be combined with
#                  parsed treebank files that are stored somewhere else!

my @treebank_overrides = (
    [ -src_file => $opt_s ],
    [ -src_type => $opt_S ],
    [ -trg_file => $opt_t ],
    [ -trg_type => $opt_T ],
);
foreach my $override (@treebank_overrides) {
    my ($key, $value) = @{$override};
    next unless $value;
    $corpus{$key} = $value;
}

# index-nodes (AlpinoXML) are only used if -i is set; without it they
# are skipped on both sides
unless ($opt_i) {
    @corpus{qw(-src_skip_indexed -trg_skip_indexed)} = (1, 1);
}

#-------------------------------------------------------------------
# training: runs if nrTrain is given (non-zero) OR if the model file
# does not exist yet
#-------------------------------------------------------------------

unless (-e $modelfile and not $nrTrain) {
    $treealigner->train(\%corpus, $modelfile, $nrTrain, 0, $nrDev);
}

#-------------------------------------------------------------------
# alignment: runs whenever the model file exists at this point
# (i.e. also directly after a successful training run)
#-------------------------------------------------------------------

if (-e $modelfile) {
    # last argument: combined number of training and development sentences
    my $nrTrainAndDev = $nrTrain + $nrDev;
    $treealigner->align(\%corpus, $modelfile, $search, $nrEval, $nrTrainAndDev);
}





__END__

=head1 NAME

treealign - training tree alignment classifiers and aligning syntactic trees


=head1 SYNOPSIS

    treealign [OPTIONS]

    # train a model from tree aligned data
    treealign -n 100 -m treealign.model -a train-data.xml

    # aligning a parallel treebank
    treealign -m treealign.model -a parallel-treebank.xml > aligned.xml


=head1 DESCRIPTION

This script allows you to train a tree alignment model and to apply it to parallel treebanks. Tree alignment is based on local binary classification and rich feature sets.

Currently, training data has to be in Stockholm Tree Aligner format. The output format is the same format. Here is a short example of this format (taken from the output of the TreeAligner):

 <?xml version="1.0" ?>
 <treealign>
 <head>
  <alignment-metadata>
    <date>Tue May  4 16:23:04 2010</date>
    <author>Lingua-Align</author>
  </alignment-metadata>
 </head>
  <treebanks>
    <treebank filename="treebanks/en/smultron_en_sophie.xml" id="en"/>
    <treebank filename="treebanks/sv/smultron_sv_sophie.xml" id="sv"/>
  </treebanks>
  <alignments>
    <align author="Lingua-Align" prob="0.11502659612149206125" type="fuzzy">
      <node node_id="s105_17" type="t" treebank_id="en"/>
      <node node_id="s109_23" type="t" treebank_id="sv"/>
    </align>
    <align author="Lingua-Align" prob="0.45281832125339427364" type="fuzzy">
      <node node_id="s105_34" type="t" treebank_id="en"/>
      <node node_id="s109_15" type="t" treebank_id="sv"/>
    </align>
  </alignments>
 </treealign>


=head2 OPTIONS

There is a number of options that can be specified on the command line.

=head3 Input options

=over

=item -a parallel-treebank-file

Name of the file that contains the parallel treebank. Default format is Stockholm Tree Aligner format (where the sentence alignment is implicitly given by tree node alignments). To use a different format use the option -A.

=item -A format

Format of the parallel treebank/corpus. Default is sta (Stockholm Tree Aligner format). Other options are, for example, 'opus' (CES XML format as it is used in the OPUS corpus)

=item -s source-treebank-file

Name of the file that contains the source language treebank. This is useful to specify a file that is different from the one that is specified in the 'parallel-treebank-file'. For example, sentence alignment files from OPUS usually refer to non-parsed XML files. With -s we can overwrite this and refer to the parsed corpus instead. However, be aware that the same sentences have to be covered in the same order and appropriate IDs of these sentences have to be found when reading through the treebank files.

=item -S format

Format of the source language treebank. Default is TigerXML (which is used in the Stockholm Tree Aligner)

=item -t target-treebank-file

Name of the target language treebank file (similar to -s but for the target language)

=item -T format

Format of the target language treebank (similar to -S)

=item -w

Swap alignment direction when reading through the parallel treebank

=item -i

Try to align index nodes as well (used in AlpinoXML)

=back



=head3 Training options


Training will be enabled if a positive number of training sentences is specified with the -n option OR the modelfile does not exist.


=over

=item -n nr_sent

Specify how many sentence (tree) pairs will be used for training a new tree-aligner model.

=item -f features

Define features to be used in training. (For alignment, features are taken from the modelfile.feat file!!) 'features' is a string with feature types separated by ':'. There are various features that can be used and combined. For more details look at L<Lingua::Align::Trees::Features>. The default is 'insideST2:insideTS2:outsideST2:outsideTS2'

=item -m model-file

Name of the file to store model parameters / read model parameters

=item -c classifier

Classifier to be used. Default is 'megam'. Another possibility is 'clue' which refers to a noisy-or like classifier with independent precision-weighted features (requires probabilistic values for each feature and supports only positive features). Other classifiers may be supported in future releases of Lingua::Align.

=item -M moses-dir

Directory with the GIZA++ and Moses word alignment files that will be used for extracting certain features. Default is 'moses' and the treealigner expects to find files with the following names

 <moses-dir>/model/lex.0-0.e2f
 <moses-dir>/model/lex.0-0.f2e
 <moses-dir>/giza.src-trg/src-trg.A3.final.gz
 <moses-dir>/giza.trg-src/trg-src.A3.final.gz
 <moses-dir>/model/aligned.intersect

An alternative way of specifying the location of word alignment files is to use the options (-d -D -g -G -y), see below.

=item -d lexe2f

Path to the probabilistic source-to-target lexicon created by Moses from the word aligned corpus. Of course, it could be any kind of bilingual dictionary as long as it provides a score for each entry and it uses the same format as the one created by Moses. Default is C<moses/model/lex.0-0.e2f>.

=item -D lexf2e

Similar to -d but for the target-to-source lexicon. Default is C<moses/model/lex.0-0.f2e>

=item -g giza.e2f.A3

Path to the Viterbi word alignment (source-to-target) created by GIZA++ (or other word aligners producing alignments in the same format). Default is C<moses/giza.src-trg/src-trg.A3.final.gz>.

=item -G giza.f2e.A3

Similar to -g but for the other alignment direction. Default = C<moses/giza.trg-src/trg-src.A3.final.gz>

=item -y symal-file

Path to the symmetrized word alignment format created by Moses (or other tools). Default = C<moses/model/aligned.intersect>


=item -I id-file

Name of the file that contains pairs of IDs for all sentences that have been word aligned with GIZA++/Moses. This is useful to match sentences when reading word alignment files for feature extraction (sometimes not all sentences are included in both the parsed collection and the word aligned data!). Note that word alignments and parallel treebanks still have to be stored in the same order but sentences may be skipped if they do not appear in one of them. The format is as follows:

 ## source-file-name    target-file-name
 src-id1   trg-id1
 src-id2   trg-id2
 ....

The delimiter is one TAB character! n:m alignments are possible (IDs separated by spaces) but only 1:1 alignments will be used in the treealigner anyway.

=item -C

Switch on the linked-children feature (depending on the links between children nodes of the current node pair). This flag has to be specified in both, train and align mode!

=item -U

Switch on the linked-subtree-nodes feature (depending on the links between all descendent nodes of the current node pair). This flag has to be specified in both, train and align mode!

=item -P

Switch on the linked-parent feature (depending on the links between parent nodes of the current node pair). This flag has to be specified in both, train and align mode! This flag should NOT be used together with -U or -C!

=item -R iter

Use <iter> number of iterations for adaptive SEARN style learning. This is only useful in connection with (any of) the link dependency features from above (-C -U -P). Instead of learning from the given true link dependency feature extracted from the training data, this option will run the training several times and adjust these features according to the predicted link likelihoods from the previously trained classifier. This is currently very slow because it re-runs the feature extraction procedure (which should not be necessary when re-running the classifier). This should be improved later but the effect of SEARN seems to be very little anyway ....

=item -L

Align terminal nodes only (leaf nodes). It is possible to use this flag together with -N which then forces the aligner to align corresponding node types only (terminals with terminals and non-terminals with non-terminals)

=item -N

Align non-terminal nodes only. If specified together with -L: align corresponding nodes as explained above.


=item -1 weight

Training weight for good (sure) alignments
Default = 3

=item -2 weight

Training weight for fuzzy (possible) alignments
Default = 1

=item -3 weight

Training weight for negative examples (non-aligned nodes)
Default = 1

=item -4 weight

Training weight for weak alignments (new category in our Europarl data)
Default = 1


=item -k

Keep the feature file extracted for training which usually is removed to save storage space. The features are stored in __train.$$ (where $$ corresponds to the process ID)

=back


=head3 Alignment options

=over

=item -x threshold

Score threshold used for tree alignment. Node pairs obtaining scores below this threshold will not be considered in the alignment process.

=item -b strategy

Type of alignment strategy to be used. Default is 'inference' which refers to a two-step procedure with local classification in the first step and alignment inference in the second (see LinkSearch with argument -l). An alternative strategy is called 'bottom-up' in which the alignment is done in a greedy bottom-up fashion starting with leaf node pairs and going up to the root nodes. Nodes are linked immediately when the classification score (conditional link likelihood) exceeds the threshold (usually 0.5). Aligned nodes are removed from the search space. Therefore, only 1:1 links are returned. In a final step link likelihoods are used to align previously unlinked nodes with the selected alignment inference strategy in the same way as in the two-step procedure.

=item -l LinkSearch

Link strategy used to extract the node alignments after classification. Default strategy is 'greedy'. Other possible strategies are 'wellformed' (greedy + wellformedness criteria) and 'threshold' (allow all links above the threshold score). You can also add the option 'final' (by adding the string '_final') to the selected strategy. In that case the aligner will first do the basic link search and then add links between nodes that obey the well-formedness criteria if either source or target language node is not linked yet. In other words, this final step makes 1:many links in the data that do not violate wellformedness. Yet another option is 'and' (which can be added as the string '_and' to the selected strategy, also in combination with '_final'). Using this option unlinked nodes (source and target) will be aligned in a last step in a greedy way even if they violate well-formedness. For example: 'wellformed_final_and' will force the aligner to, first, look for 1:1 links that are well-formed (multiple links are not allowed), then add well-formed links between nodes where one of them is already linked to another one, and, finally, adds links between still unlinked nodes.

=item -u

Switch to add-links mode (union). Existing links between nodes will be kept in the output file and new ones will be added. (In the default mode, existing links will be considered for evaluation only). This option is especially useful if one wants to use a pipeline of alignments, for example, terminal node alignment first and non-terminal nodes in the next step.

=item -K

Similar to -u: switches to 'add-link' mode but now forces the aligner to use existing links to compete with the new ones. This means that the scores of existing links will be used in the link search algorithm applied for aligning tree nodes. This may also cause some existing links to disappear, for example, because they no longer conform to the wellformedness criteria.

=back

=head3 Runtime and other options

=over

=item -v

Verbose output

=item -O format

Output format: one of 'sta' (default) or 'dublin' (Dublin subtree aligner format).


=back



=head1 SEE ALSO

L<Lingua::Align::Trees>, L<Lingua::Align::Features>, L<Lingua::Align::Corpus>
 

=head1 AUTHOR

Joerg Tiedemann

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.

Copyright for MegaM by Hal Daume III
see http://www.cs.utah.edu/~hal/megam/ for more information
Paper: Notes on CG and LM-BFGS Optimization of Logistic Regression, 2004
http://www.cs.utah.edu/~hal/docs/daume04cg-bfgs.pdf

=cut