The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#
#---------------------------------------------------------------------------
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
#
# $Id$
# 
#
# usage: 
#
#

use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";

use Uplug::Data::Align;
use Uplug::Data;
use Uplug::IO::Any;
use Uplug::Config;
use Uplug::StrSim;


#---------------------------------------------------------------------------
# read config-file and check arguments

my %IniData=&GetDefaultIni;
my $IniFile='strsim.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------
# open input and output data streams

my ($InputStreamName,$InputStream)=                    # input stream
    each %{$IniData{'input'}};
my $SimStream=$IniData{output}{'string similarities'}; # output: string pairs

my $input=Uplug::IO::Any->new($InputStream);
my $strsim=Uplug::IO::Any->new($SimStream);

$input->open('read',$InputStream);
if (not $strsim->open('write',$SimStream)){exit;}

#---------------------------------------------------------------------------
# set script parameters

my $Param={};
$Param=$IniData{parameter};

my $freq=$IniData{parameter}{token}{'minimal frequency'};
my %MinFreq;
$MinFreq{source}=$IniData{parameter}{token}{'minimal frequency (source)'};
$MinFreq{target}=$IniData{parameter}{token}{'minimal frequency (target)'};
my %lang;
$lang{source}=$IniData{parameter}{token}{'language (source)'};
$lang{target}=$IniData{parameter}{token}{'language (target)'};
my %ExclStop;
$ExclStop{source}=$IniData{parameter}{token}{'exclude stop words (source)'};
$ExclStop{target}=$IniData{parameter}{token}{'exclude stop words (target)'};

my $PrintProgr=$IniData{parameter}{runtime}{'print progress'};
my $Buffer=$IniData{parameter}{runtime}{buffer};

my $sim_measure=$IniData{parameter}{'similarity measures'}{metrics};
my $precision=$IniData{parameter}{'similarity measures'}{precision};
my $use_weights=$IniData{parameter}{'similarity measures'}{'use weights'};
my $use_nm_weights=
    $IniData{parameter}{'similarity measures'}{'use not-matching-weights'};
my $use_Ngrams=1;
if (defined $IniData{parameter}{'similarity measures'}{'use N-grams'}){
    my $use_Ngrams=$IniData{parameter}{'similarity measures'}{'use N-grams'};
}
my $MinScore=$IniData{parameter}{'similarity measures'}{'minimal score'};
my $rmLinked=$IniData{parameter}{token}{'remove linked'};

my $delimiter='';

#---------------------------------------------------------------------------

if ($PrintProgr){
    print STDERR "read alignments\n";
}

#---------------------------------------------------------------------------
# main part
#   0) initialize some variables

my $count=0;
my $AlignCount=0;
my %SimScores;
my %SrcFreq;
my %TrgFreq;
my ($TotalCooc,$TotalSrc,$TotalTrg);
my %weights;
my (%SrcNMngrams,%TrgNMngrams);

my ($nrSrc,$nrTrg);
my $nrTotal=0;
my $nrPairs=0;

my $data=Uplug::Data::Align->new($lang{source},$lang{target});


#---------------------------------------------------------------------
# 1) loop: read all bitext segments and calculate similarities between
#          source and target language items (words and MWUs)

my $time=time();
while ($input->read($data)){
    $AlignCount++;
    if ($rmLinked){
	$data->rmLinkedToken($data);
    }

    #---------------------------------------------------------------------
    # verbose-mode:
    #
    if ($PrintProgr){
	if (not ($AlignCount % 100)){
	    #
	    # print info each 100 bitext segments:
	    #    nr-of-segments (time: processing-time in seconds,
	    #                    nr-of-source-items:nr-of-target-items ->
	    #                    nr-of-saved-pairs/nr-of-total-pairs)
	    #
	    # nr-of-saved-pairs: pairs that have been saved in the score-hash
	    # nr-of-total-pairs: nr of pairs in the last 100 bitext segments
	    #
	    $|=1;print STDERR "$AlignCount segments (time: ";
	    print STDERR time()-$time;
	    print STDERR " sec, $nrSrc:$nrTrg -> $nrPairs/$nrTotal)\n";
	    $|=0;
	    $nrSrc=0;$nrTrg=0;$nrTotal=0;$nrPairs=0;
	}
	if (not ($AlignCount % 10)){
	    $|=1;print STDERR '.';$|=0;
	}
    }
    #---------------------------------------------------------------------

    my (%Ngrams,%NgramPos);
    my @SrcNodes=();
    my @TrgNodes=();
    @{$Ngrams{source}}=$data->getSrcPhrases($$Param{token},\@SrcNodes);
    @{$Ngrams{target}}=$data->getTrgPhrases($$Param{token},\@TrgNodes);

    $nrSrc+=$#{$Ngrams{source}};
    $nrTrg+=$#{$Ngrams{target}};

    foreach my $src (@{$Ngrams{source}}){
	if ($ExclStop{source} and $data->{source}->isStopWord($src)){next;}
	foreach my $trg (@{$Ngrams{target}}){

	    $nrTotal++;
	    if (defined $SimScores{$src}{$trg}){next;}
	    if ($data->lengthQuotient($src,$trg)<$MinScore){next;}
	    if ($ExclStop{target} and $data->{target}->isStopWord($trg)){next;}
	    if (not $data->checkPairParameter($src,$trg,$$Param{token})){next;}

	    $count++;
	    if ($Buffer and ($count>$Buffer)){
		&WritePairs($strsim,\%SimScores,$MinScore,$PrintProgr);
		$count=0;
	    }

#-----------------------------------------------------------------------------
# everything's ok!
# --> calculate similarity scores!
#-----------------------------------------------------------------------------

	    my $score;
	    if ($use_weights){
		$score=&similar($src,$trg,$sim_measure,1,\%weights,$delimiter);
	    }
	    elsif($use_nm_weights){
		$score=&WeightedSimilarity($src,$trg,\%weights,
					   \%SrcNMngrams,\%TrgNMngrams);
	    }
	    else{
		$score=&similar($src,$trg,$sim_measure,1,undef,$delimiter);
	    }

	    if ($precision){                               # truncate scores
		$score=int($score*10**$precision+0.5)/     # (precision=
		    (10**$precision);                      #   number of dec)
	    }

	    $nrPairs++;
	    $SimScores{$src}{$trg}=$score;
	}
    }
}
$input->close;

#---------------------------------------------------------------------
# write pairs with scores to the database

&WritePairs($strsim,\%SimScores,$MinScore,$PrintProgr);

$input->close;
my %header = ('align count' => $AlignCount,             # save some counts
	      'token pair count' => $TotalCooc);        # in the header
$strsim->addheader(\%header);
$strsim->addheader($$Param{token});
$strsim->writeheader;
$strsim->close;


# end of main
#---------------------------------------------------------------------




sub WeightedSimilarity{
    my ($src,$trg,$weights,$SrcNMngrams,$TrgNMngrams)=@_;
    my %non;
    my ($score)=&GetNonMatches($src,$trg,\%non);

    my ($s,$t);
    my $oldsc=$score;
    foreach $s (keys %non){
	foreach $t (keys %{$non{$s}}){
	    my $pat='';
	    if ((ref($SrcNMngrams) eq 'HASH')and(ref($TrgNMngrams) eq 'HASH')){
		$pat='#';
		my @NonSrc=&GetSubStrings($s,$SrcNMngrams,
					  undef,20,'');
		$s=join('#',@NonSrc);
		my @NonTrg=&GetSubStrings($t,$TrgNMngrams,
					  undef,20,'');
		$t=join('#',@NonTrg);
	    }
	    $score+=&LCS($s,$t,$weights,$pat);
#	  if ($score>$oldsc){
#	      print "-($s|$t)-";
#	  }
	}
    }
#	  if ($score>$oldsc){
#	     print "\t\t$oldsc -> $score ($SrcNgram[$src]|$TrgNgram[$trg])\n";
#	  }
    if (length($src)>
	length($trg)){
	$score/=length($src);
    }
    else{
	$score/=length($trg);
    }
    return $score;
}

sub WritePairs{
    my ($stream,$SimScore,$MinScore,$PrintProgr)=@_;
    if ($PrintProgr){
	print STDERR "write scores\n";
    }
    my ($src,$trg);
    my $data=Uplug::Data->new;
    foreach $src (keys %{$SimScore}){
	foreach $trg (keys %{$$SimScore{$src}}){
	    if ($MinScore){
		if ($$SimScore{$src}{$trg}<$MinScore){next;}
	    }
	    $data->init();
	    $data->setAttribute('source',$src);
	    $data->setAttribute('target',$trg);
	    $data->setAttribute('score',$$SimScore{$src}{$trg});
	    $stream->write($data);
	    delete $$SimScore{$src}{$trg};
	}
	if (not keys %{$$SimScore{$src}}){
	    delete $$SimScore{$src};
	}
    }
}


sub GetDefaultIni{

    my $DefaultIni = {
  'module' => {
    'program' => 'strsim.pl',
    'location' => '$UplugBin',
    'name' => 'LCSR - the longest common sub-sequence ratio',
#    'stdin' => 'bitext',
  },
  'description' => 'The longest common sub-sequence ratio is
  calculated for co-occurring words and chunks.',
  'input' => {
    'bitext' => {
	'format' => 'xces align',
    }
  },
  'output' => {
    'string similarities' => {
      'stream name' => 'string similarities',
    },
  },
  'parameter' => {
    'token' => {
      'chunks (source)' => 'c.*',            # use chunks
      'chunks (target)' => 'c.*',            # use chunks
#      'minimal length diff' => 0.3,
#     'matching word class' => 'same',       # don't mix content and stop words
      'minimal frequency' => 1,
      'minimal frequency (source)' => 1,
      'minimal frequency (target)' => 1,
      'minimal length (source)' => 3,
      'minimal length (target)' => 3,
      'maximal ngram length (source)' => 1,  # >1 --> use N-grams
      'maximal ngram length (target)' => 1,  # >1 --> use N-grams
#      'use attribute (source)' => 'none',
#      'use attribute (target)' => 'none',
#      'grep token (source)' => 'contains alphabetic',
#      'grep token (target)' => 'contains alphabetic',
      'lower case (source)' => 1,
      'lower case (target)' => 1,
      'exclude stop words (source)' => 1,
      'exclude stop words (target)' => 1,
#      'language (source)' => 'english',
#      'language (target)' => 'swedish',
      'language (source)' => 'default',
      'language (target)' => 'default',
      'delimiter' => '\\s+',
      'token label' => 'w',
      'remove linked' => 0,
    },
    'similarity measures' => {
      'minimal score' => 0.4,
      'use not-matching-weights' => 0,
      'use N-grams' => 0,
      'metrics' => 'lcsr',
#      'precision' => 4,
      'use weights' => 0,
    },
    'runtime' => {
      'print progress' => 1,
      'buffer' => 2000000,
    },
  },
  'arguments' => {
    'shortcuts' => {
       'in' => 'input:bitext:format',
       'infile' => 'input:intext:file',
       'informat' => 'input:intext:format',
       'out' => 'output:string similarities:file',
       'outformat' => 'output:string similarities:format',
       'srclang' => 'parameter:token:language (source)',
       'trglang' => 'parameter:token:language (target)',
    }
  },
  'widgets' => {
  }
};

    return %{$DefaultIni};
}