#!/usr/bin/env perl
#
# coocstat.pl
#
#---------------------------------------------------------------------------
# Copyright (C) 2004 Jörg Tiedemann <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#---------------------------------------------------------------------------
#
# $Id$
#
# usage: coocstat.pl [OPTIONS]
#
#
# default parameters are given in the &GetDefaultIni subfunction
# at the end of the script!
#
use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";
use Uplug::Data;
use Uplug::Data::Align;
use Uplug::IO::Any;
use Uplug::Config;
use Uplug::CoocStat;
my %IniData=&GetDefaultIni;
my $IniFile='coocstat.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);
#---------------------------------------------------------------------------
if (ref($IniData{input}) ne 'HASH'){die "# coocstat.pl: no input found!\n";}
my $CoocStream=$IniData{input}{'cooc freq'};
my $SrcStream=$IniData{input}{'source freq'};
my $TrgStream=$IniData{input}{'target freq'};
if (ref($IniData{output}) ne 'HASH'){die "# coocstat.pl: no output found!\n";}
my ($StatStreamName,$StatStream)=each %{$IniData{output}};
my $coocfreq=Uplug::IO::Any->new($CoocStream);
my $srcfreq=Uplug::IO::Any->new($SrcStream);
my $trgfreq=Uplug::IO::Any->new($TrgStream);
my $coocstat=Uplug::IO::Any->new($StatStream);
#---------------------------------------------------------------------------
$coocfreq->open('read',$CoocStream);
$srcfreq->open('read',$SrcStream);
$trgfreq->open('read',$TrgStream);
my $header=$coocfreq->header;
$coocstat->addheader($header);
$coocstat->open('write',$StatStream);
$coocstat->writeheader;
my $header=$srcfreq->header;
my $SrcCount=$header->{'token count'};
my $header=$trgfreq->header;
my $TrgCount=$header->{'token count'};
#---------------------------------------------------------------------------
# set module parameters (from IniData)
my ($measure,$precision,$MinScore); # statistics
my ($freq,$LenDiff,$ClassMatch); # thresholds
my %length; # token length thresholds (source+target)
my %MinFreq; # minimal frequency (source+target)
my %GrepTok; # restrict string types (source+target)
my %lang; # language (source+target)
my $PrintProgr; # verbose-mode
if (ref($IniData{parameter}) eq 'HASH'){
if (ref($IniData{parameter}{'co-occurrence'}) eq 'HASH'){
$precision=$IniData{'parameter'}{'co-occurrence'}{'precision'};
$MinScore=$IniData{'parameter'}{'co-occurrence'}{'minimal score'};
$measure=$IniData{'parameter'}{'co-occurrence'}{'measure'};
}
if (ref($IniData{parameter}{'token pair'}) eq 'HASH'){
$freq=$IniData{'parameter'}{'token pair'}{'minimal frequency'};
$LenDiff=$IniData{'parameter'}{'token pair'}{'minimal length diff'};
$ClassMatch=$IniData{'parameter'}{'token pair'}{'matching word class'};
}
if (ref($IniData{parameter}{'source token'}) eq 'HASH'){
$length{source}=
$IniData{'parameter'}{'source token'}{'minimal length'};
$MinFreq{source}=
$IniData{'parameter'}{'source token'}{'minimal frequency'};
$lang{source}=$IniData{'parameter'}{'source token'}{'language'};
$GrepTok{source}=$IniData{'parameter'}{'source token'}{'grep token'};
}
if (ref($IniData{parameter}{'target token'}) eq 'HASH'){
$length{target}=
$IniData{'parameter'}{'target token'}{'minimal length'};
$MinFreq{target}=
$IniData{'parameter'}{'target token'}{'minimal frequency'};
$GrepTok{target}=$IniData{'parameter'}{'target token'}{'grep token'};
$lang{target}=$IniData{'parameter'}{'target token'}{'language'};
}
if (ref($IniData{parameter}{runtime}) eq 'HASH'){
$PrintProgr=$IniData{'parameter'}{runtime}{'print progress'};
}
}
my $stat=Uplug::CoocStat->new($measure);
if (not ref($stat)){die "# coocstat.pl: cannot find '$measure'!\n";}
#---------------------------------------------------------------------------
my $header=$coocfreq->header;
my $AlignCount=$header->{'align count'};
my $PairCount=$header->{'token pair count'};
#---------------------------------------------------------------------------
# create instances of data objects
my $TreeData=Uplug::Data::Align->new($lang{source}, # new alignment
$lang{target}); # data object
my $found=Uplug::Data->new; # search results
my $OutData=Uplug::Data->new; # output data
#---------------------------------------------------------------------------
# main: read frequency files and compute scores
my $count=0;
my %CoocStats;
if ($PrintProgr){print STDERR "read frequencies and calculate '$measure'\n";}
while ($coocfreq->read($TreeData)){
my $data=$TreeData->attribute;
if ($freq and ($data->{freq}<$freq)){next;}
for ('source','target'){
if ($length{$_}){
if (length($data->{$_})<$length{$_}){next;}
}
if ($GrepTok{$_}){
if (not $TreeData->{$_}->isStringType($data->{$_},$GrepTok{$_})){
next;
}
}
}
#------------------------------------------------------
# check length difference ratio if necessary
if ($LenDiff){
if ($TreeData->lengthQuotient($data->{source},
$data->{target})<$LenDiff){next;}
}
#------------------------------------------------------
# check token classes if necessary
if ($ClassMatch){
if (not $TreeData->isSameType($lang{source},$lang{target},
$data->{source},$data->{target},
$ClassMatch)){next;}
}
#------------------------------------------------------
# look for source and target token frequencies
# (this makes it slow ...:)
my %search=('token' => $data->{source}); # source token
$found->init();
$srcfreq->select($found,\%search);
my $tokfreq=$found->attribute;
if ($MinFreq{source} and ($tokfreq->{freq}<$MinFreq{source})){next;}
$data->{'srcfreq'}=$tokfreq->{freq};
my %search=('token' => $data->{target}); # target token
$found->init();
$trgfreq->select($found,\%search);
my $tokfreq=$found->attribute;
if ($MinFreq{target} and ($tokfreq->{freq}<$MinFreq{target})){next;}
$data->{'trgfreq'}=$tokfreq->{freq};
#------------------------------------------------------
$count++;
if ($PrintProgr){
if (not ($count % 500)){
$|=1;print STDERR '.';$|=0;
}
if (not ($count % 10000)){
$|=1;print STDERR "$count pairs\n";$|=0;
}
}
#------------------------------------------------------
#------------------------------------------------------
# finally: compute the score!
my $score=$stat->compute($data->{freq},
$data->{srcfreq},
$data->{trgfreq},
$PairCount);
if ($precision){
$score=int($score*10**$precision+0.5)/(10**$precision);
}
if ($MinScore){
if ($score<$MinScore){next;}
}
#------------------------------------------------------
# save score in output
$OutData->init();
$OutData->setAttribute('source',$data->{source});
$OutData->setAttribute('target',$data->{target});
$OutData->setAttribute('score',$score);
$coocstat->write($OutData);
}
$coocfreq->close;
$srcfreq->close;
$trgfreq->close;
$coocstat->close;
# end of main
#---------------------------------------------------------------------------
sub GetDefaultIni{
my $DefaultIni =
{
'module' => {
'program' => 'coocstat.pl',
'location' => '$UplugBin',
'name' => 'Dice coefficient',
},
'description' => 'This module calculates Dice scores from
co-occurrence counts.',
'input' => {
'cooc freq' => {
'stream name' => 'cooc freq',
},
'source freq' => {
'stream name' => 'source freq',
},
'target freq' => {
'stream name' => 'target freq',
},
},
'output' => {
'dice' => {
'stream name' => 'dice',
},
},
'parameter' => {
'token pair' => {
'minimal frequency' => 2,
# 'minimal length diff' => 0.5,
# 'matching word class' => 'same'
},
'source token' => {
'minimal frequency' => 2,
# 'minimal length' => 4,
# 'grep token' => 'contains alphabetic',
# 'language' => 'default',
# 'lower case' => 1,
},
'target token' => {
'minimal frequency' => 2,
# 'minimal length' => 4,
# 'grep token' => 'contains alphabetic',
# 'language' => 'default',
# 'lower case' => 1
},
'co-occurrence' => {
'minimal score' => 0.2,
'measure' => 'dice',
# 'precision' => 4,
},
'runtime' => {
'print progress' => 1,
},
},
'arguments' => {
'shortcuts' => {
'src' => 'input:source freq:file',
'trg' => 'input:target freq:file',
'cooc' => 'input:cooc freq:file',
'stat' => 'output:cooc stat:file',
's' => 'parameter:co-occurrence:measure',
'm' => 'parameter:co-occurrence:minimal score',
'min' => 'parameter:co-occurrence:minimal score',
}
},
'widgets' => {
}
};
return %{$DefaultIni};
}