The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#
# coocfreq.pl: count token frequencies
#
#---------------------------------------------------------------------------
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
#
# $Id$
#
# usage: 
#
#

use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";

use Uplug::IO::Any;
use Uplug::Data;
use Uplug::Data::Align;
use Uplug::Config;

my %IniData=&GetDefaultIni;
my $IniFile='coocfreq.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------

my ($InputStreamName,$InputStream)=             # input data (only one!)
    each %{$IniData{'input'}};
my $SrcStream=$IniData{output}{'source freq'};  # source frequencies
my $TrgStream=$IniData{output}{'target freq'};  # target frequencies
my $CoocStream=$IniData{output}{'cooc freq'};   # co-occurrence frequencies

my $input=Uplug::IO::Any->new($InputStream);
my $srcfreq=Uplug::IO::Any->new($SrcStream);
my $trgfreq=Uplug::IO::Any->new($TrgStream);
my $coocfreq=Uplug::IO::Any->new($CoocStream);

#---------------------------------------------------------------------------

$input->open('read',$InputStream);
$srcfreq->open('write',$SrcStream);
$trgfreq->open('write',$TrgStream);
if (not $coocfreq->open('write',$CoocStream)){exit;}

#---------------------------------------------------------------------------

my $Param={};
$Param=$IniData{parameter};

my $MinPairFreq=$IniData{parameter}{token}{'minimal frequency'};
my %MinFreq;
$MinFreq{source}=$IniData{parameter}{token}{'minimal frequency (source)'};
$MinFreq{target}=$IniData{parameter}{token}{'minimal frequency (target)'};
my %lang;
$lang{source}=$IniData{parameter}{token}{'language (source)'};
$lang{target}=$IniData{parameter}{token}{'language (target)'};
my %ExclStop;
$ExclStop{source}=$IniData{parameter}{token}{'exclude stop words (source)'};
$ExclStop{target}=$IniData{parameter}{token}{'exclude stop words (target)'};
my $rmLinked=$IniData{parameter}{token}{'remove linked'};

my $PrintProgr=$IniData{'parameter'}{'runtime'}{'print progress'};
my $Buffer=$IniData{'parameter'}{'runtime'}{'buffer'};
my $SrcBuffer=$IniData{'parameter'}{'runtime'}{'source buffer'};
my $TrgBuffer=$IniData{'parameter'}{'runtime'}{'target buffer'};
my $MaxSegments=$IniData{'parameter'}{'runtime'}{'max nr of segments'};
my $CleanBuffer=$IniData{'parameter'}{'runtime'}{'clean buffer'};
# my $CleanBuffer=1;

my %First;

my $TmpBuffer;
my $BufferPuffer=$Buffer/10;      # keep at least 10% of the buffer size
                                  # flush the buffer otherwise!

#---------------------------------------------------------------------------

if ($PrintProgr){
    print STDERR "read alignments\n";
}

my %CoocFreq;      # token pair frequencies
my %SrcFreq;       # source token frequencies
my %TrgFreq;       # target token frequencies

my $AlignCount=0;                         # bitext segment counter
my ($CoocTypes,$SrcTypes,$TrgTypes);      # type counter
my ($CoocTokens,$SrcTokens,$TrgTokens);   # token counter
my $HashTypes;                            # type counter (buffer management)



#---------------------------------------------------------------------------
# now: read from input and count!

my $data=Uplug::Data::Align->new($lang{source},$lang{target});
# my $data=Uplug::Data->new('align',$lang{source},$lang{target});

my $time=time();
while ($input->read($data)){
    $AlignCount++;

    if ($rmLinked){$data->rmLinkedToken($data);}

    if ($PrintProgr){
	if (not ($AlignCount % 100)){
	    $|=1;
	    print STDERR "$AlignCount segments (";
	    print STDERR time()-$time;
	    print STDERR " sec, $SrcTypes/$SrcTokens:$TrgTypes/$TrgTokens:$CoocTypes/$CoocTokens)\n";
	    $|=0;
	    if ($MaxSegments){
		if ($AlignCount>$MaxSegments){last;}
	    }
	}
    }


    my %SrcNgrams=();my %TrgNgrams=();
    $data->getAlignPhrases($$Param{token},\%SrcNgrams,\%TrgNgrams);

    #---------------------------------------
    # count frequencies
    #  0) frequencies in the bigram segment
    #  1) add bigram segment frequencies to the total counts
    {
	my %SentFreq;
	&CountSentFreq(\%SentFreq,\%SrcNgrams,\%TrgNgrams);

	# target token frequencies

	foreach my $trg (keys %{$SentFreq{trg}}){
	    if ($ExclStop{target} and $data->{target}->isStopWord($trg)){next;}
	    if (not defined $TrgFreq{$trg}){$TrgTypes++;}
	    $TrgFreq{$trg}+=$SentFreq{trg}{$trg};
	    $TrgTokens+=$SentFreq{trg}{$trg};
	    if ($TrgBuffer and (not ($TrgTypes % $TrgBuffer))){
		&WriteFreq($trgfreq,\%TrgFreq,$MinFreq{target},$PrintProgr);
	    }
	}

	# source frequencies

	foreach my $s (keys %{$SentFreq{cooc}}){
	    if ($ExclStop{source} and $data->{source}->isStopWord($s)){next;}
	    if (not defined $SrcFreq{$s}){$SrcTypes++;}
	    $SrcFreq{$s}+=$SentFreq{src}{$s};
	    $SrcTokens+=$SentFreq{src}{$s};
	    if ($SrcBuffer and (not ($SrcTypes % $SrcBuffer))){
		&WriteFreq($srcfreq,\%SrcFreq,$MinFreq{source},$PrintProgr);
	    }

	    # co-occurrence frequencies

	    foreach my $t (keys %{$SentFreq{cooc}{$s}}){
		my $src=$s;
		my $trg=$t;

		#---------------------------
		# relative position feature is special!
		# count marginal frequencies for each pair!
		
		if ($$Param{token}{'relative position'}){
		    ($src,$trg)=&makeRelPosFeature($src,$trg);
		    if (not defined $TrgFreq{$trg}){$TrgTypes++;}
		    $TrgFreq{$trg}+=$SentFreq{trg}{$t};
		    $TrgTokens+=$SentFreq{trg}{$t};
		    if ($TrgBuffer and (not ($TrgTypes % $TrgBuffer))){
			&WriteFreq($trgfreq,\%TrgFreq,$MinFreq{target},
				   $PrintProgr);
		    }
		    if (not defined $SrcFreq{$src}){$SrcTypes++;}
		    $SrcFreq{$src}+=$SentFreq{src}{$s};
		    $SrcTokens+=$SentFreq{src}{$s};
		    if ($SrcBuffer and (not ($SrcTypes % $SrcBuffer))){
			&WriteFreq($srcfreq,\%SrcFreq,$MinFreq{source},
				   $PrintProgr);
		    }
		}
		#---------------------------

		if ($ExclStop{target} and $data->{target}->IsStopWord($trg)){
		    next;
		}

		if ($data->checkPairParameter($src,$trg,$$Param{token})){

		    if (not defined $CoocFreq{$src}){
			$CoocTypes++;$HashTypes++;
		    }
		    elsif (not defined $CoocFreq{$src}{$trg}){
			$CoocTypes++;$HashTypes++;
		    }
		    $CoocFreq{$src}{$trg}+=$SentFreq{cooc}{$s}{$t};
		    $CoocTokens+=$SentFreq{cooc}{$s}{$t};

		    #----------------------------------------------
		    # check the buffer! if overflow:
		    #    alt 1) clean the buffer from low frequency pairs
		    #    alt 2) write some pairs or flush the buffer

		    if ($Buffer and (not ($HashTypes % $Buffer))){
			if ($CleanBuffer){
			    &CleanBuffer(\%CoocFreq,$MinPairFreq,$PrintProgr);
			}
			else{
			    my $written=&WritePairs($coocfreq,\%CoocFreq,
						    $MinPairFreq,$PrintProgr);
			    if ($written<$BufferPuffer){
				&FlushBuffer(\%CoocFreq,$PrintProgr);
				$HashTypes=0;
			    }
			}
		    }
		    #----------------------------------------------
		}
	    }
	}
    }
}
$input->close;

# end of input file
#---------------------------------------------------------------------------
# write frequencies to files

&WritePairs($coocfreq,\%CoocFreq,$MinPairFreq,$PrintProgr);
&WriteFreq($srcfreq,\%SrcFreq,$MinFreq{source},$PrintProgr);
&WriteFreq($trgfreq,\%TrgFreq,$MinFreq{target},$PrintProgr);

if(-e $TmpBuffer){&FlushBuffer(\%CoocFreq,$PrintProgr);}
&RestoreFlushedBuffer($coocfreq,$MinPairFreq,
		      $srcfreq,$trgfreq,$PrintProgr);

#-----------------------------------------
# write headers to frequency-files

$input->close;
my %header = ('align count' => $AlignCount,
	      'token pair count' => $CoocTokens,
	      'type pair count' => $CoocTypes);
$coocfreq->addheader(\%header);
$coocfreq->addheader($$Param{token});
$coocfreq->writeheader;
$coocfreq->close;
my %header = ('type count' => $SrcTypes,
	      'token count' => $SrcTokens);
foreach (keys %{$$Param{token}}){
    if (/source/){
	my $attr=$_;
	$attr=~s/ (source)//;
	$header{$attr}=$$Param{token}{$_};
    }
}
$srcfreq->addheader(\%header);
$srcfreq->writeheader;
$srcfreq->close;
my %header = ('type count' => $TrgTypes,
	      'token count' => $TrgTokens);
foreach (keys %{$$Param{token}}){
    if (/target/){
	my $attr=$_;
	$attr=~s/ (target)//;
	$header{$attr}=$$Param{token}{$_};
    }
}

$trgfreq->addheader(\%header);
$trgfreq->writeheader;
$trgfreq->close;

# end of main
#---------------------------------------------------------------------------








#---------------------------------------------------------------------------
# CountSentFreq
#
# count tokens and token pairs in a sentence pair
#     - one type / sentence pair OR
#     - all tokens/token pairs

sub CountSentFreq{
    my ($freq,$src,$trg)=@_;
    foreach my $t (values %{$trg}){
#	$$freq{trg}{$t}=1;                # count trg only once/sentence
	$$freq{trg}{$t}++;                # OR count all trg-tokens
    }
    foreach my $s (values %{$src}){
#	$$freq{src}{$s}=1;                # count src only once/sentence
	$$freq{src}{$s}++;                # OR count all src-tokens
	foreach my $t (values %{$trg}){
#	    $$freq{cooc}{$s}{$t}=1;       # count all src-trg pairs only once
	    $$freq{cooc}{$s}{$t}++;       # OR count all src-trg pairs

#-------------------------------------
# coocfreq <= min (srcfreq,trgfreq)

	    if ($$freq{cooc}{$s}{$t}>$$freq{src}{$s}){
		$$freq{cooc}{$s}{$t}=$$freq{src}{$s};
	    }
	    if ($$freq{cooc}{$s}{$t}>$$freq{trg}{$t}){
		$$freq{cooc}{$s}{$t}=$$freq{trg}{$t};
	    }

#-------------------------------------
# coocfreq <= max (srcfreq,trgfreq)
#
#	    if (($$freq{cooc}{$s}{$t}>$$freq{src}{$s}) and
#		($$freq{cooc}{$s}{$t}>$$freq{trg}{$t})){
#		if ($$freq{trg}{$t}>$$freq{src}{$s}){
#		    $$freq{cooc}{$s}{$t}=$$freq{trg}{$t};
#		}
#		else{
#		    $$freq{cooc}{$s}{$t}=$$freq{src}{$s};
#		}
#	    }
	}
    }
}



#---------------------------------------------------------------------------
# WritePairs: save co-occurrence frequencies

sub WritePairs{
    my ($coocfreq,$CoocFreq,$MinPairFreq,$PrintProgr)=@_;
    my ($src,$trg);
    if ($PrintProgr){
	my $nrSRC=scalar keys %{$CoocFreq};
	print STDERR "write pairs ($nrSRC source items)\n";
    }
    my $written=0;
    my $nrSRC=0;
    foreach $src (keys %{$CoocFreq}){
	$nrSRC++;
	foreach $trg (keys %{$$CoocFreq{$src}}){
	    my $total=$$CoocFreq{$src}{$trg};
	    my $freq=$total;
	    if ($total<$MinPairFreq){next;}

#	    my $data=Uplug::Data::Tree->new;
	    my $data=Uplug::Data->new();
	    $data->setData({source => $src,
			    target => $trg,
			    freq => $freq});
#	    $data->setAttribute('source',$src);
#	    $data->setAttribute('target',$trg);
#	    $data->setAttribute('freq',$freq);      # DBM adds up frequencies!
	    $written++;
	    if ($PrintProgr){
		if (not ($written % 1000)){
		    $|=1;print STDERR '.';$|=0;
		}
		if (not ($written % 50000)){
		    $|=1;print STDERR "$written saved ($nrSRC)\n";$|=0;
		}
	    }
	    $coocfreq->write($data);
	    $HashTypes--;
	    delete $$CoocFreq{$src}{$trg};
	}
	if (not keys %{$$CoocFreq{$src}}){
	    delete $$CoocFreq{$src};
	}
    }
    if ($PrintProgr){
	$|=1;print STDERR "$written saved ($nrSRC)\n";$|=0;
    }
    return $written;
    if ($written<$BufferPuffer){
	&FlushBuffer($CoocFreq,$PrintProgr);
	$HashTypes=0;
    }
    $First{$coocfreq}=1;
}

#---------------------------------------------------------------
# CleanBuffer: take away all pairs with the lowest frequencies
# (assuming that they don't occur very much in the future either)

sub CleanBuffer{
    my ($CoocFreq,$minfreq,$verbose)=@_;
    my $nr=0;
    my $freq=1;
    if ($verbose){print STDERR "clean buffer ... ";}
    while (($freq<$minfreq) and ($nr<$BufferPuffer) and (keys %{$CoocFreq})){
	foreach my $src (keys %{$CoocFreq}){
	    foreach my $trg (keys %{$$CoocFreq{$src}}){
		if ($$CoocFreq{$src}{$trg}==$freq){
		    $HashTypes--;
		    delete $$CoocFreq{$src}{$trg};
		    $nr++;
		}
	    }
	    if (not keys %{$$CoocFreq{$src}}){
		delete $$CoocFreq{$src};
	    }
	}
	$freq++;
    }
    $freq--;
    if ($verbose){print STDERR "$nr pairs deleted (freq $freq)!\n";}
    if ($nr<$BufferPuffer){&FlushBuffer($CoocFreq,$verbose);}
}

#---------------------------------------------------------------
# flush buffer to temp file

sub FlushBuffer{
    my ($CoocFreq,$PrintProgr)=@_;
    if (not defined $TmpBuffer){
	$TmpBuffer=Uplug::IO::Any::GetTempFileName;
	open F,"| gzip -c >$TmpBuffer.gz";
    }
    else{
	open F,"| gzip -c >>$TmpBuffer.gz";
    }
    if ($PrintProgr){
	$|=1;print STDERR "flushing buffer to $TmpBuffer.gz!\n";$|=0;
    }
    my $nr=0;
    foreach my $src (keys %{$CoocFreq}){
	$nr++;
	foreach my $trg (keys %{$$CoocFreq{$src}}){
	    print F "$src\x00$trg\x00$$CoocFreq{$src}{$trg}\n";
	    delete $$CoocFreq{$src}{$trg};
	}
	delete $$CoocFreq{$src};
    }
    close F;
}


#----------------------------------------------------------------------
# read data from the flushed buffer and write data to DB
#----------------------------------------------------------------------


sub RestoreFlushedBuffer{
    my ($coocfreq,$minfreq,$srcfreq,$trgfreq,$progress)=@_;
    if (not -e $TmpBuffer){return;}

    my %src=();
    my %trg=();

    my %skipsrc=();
    my %skiptrg=();

    my $nrSrc=0;
    my $nrTrg=0;
    my $skipped=0;
    my $written;
    my $count=0;

    #----------------------------------------------------------------------
    if ($progress){
	$|=1;print STDERR "read buffer\n";$|=0;
    }
    #----------------------------------------------------------------------

    open F,"gzip -cd <$TmpBuffer.gz |";
    while (<F>){
	$count++;

	if ($progress){
	    if (not ($count % 5000)){
		$|=1;print STDERR '.';$|=0;
	    }
	    if (not ($count % 100000)){
		$|=1;print STDERR "$written saved, $skipped skipped ($count)\n";$|=0;
	    }
	}

	chomp;
	my ($s,$t,$f)=split(/\x00/);

	if (($nrSrc<$SrcBuffer) and (not defined $src{$s})){
	    $src{$s}=&GetTokFreq($srcfreq,$s);
	    $nrSrc++;
	}
	if (($nrTrg<$TrgBuffer) and (not defined $trg{$t})){
	    $trg{$t}=&GetTokFreq($trgfreq,$t);
	    $nrTrg++;
	}
	my ($sf,$tf);
	if (defined $src{$s}){$sf=$src{$s};}
	else{$sf=&GetTokFreq($srcfreq,$s);}
	if (defined $trg{$t}){$tf=$trg{$t};}
	else{$tf=&GetTokFreq($trgfreq,$t);}

	if ($sf<$minfreq){$skipped++;next;}
	if ($tf<$minfreq){$skipped++;next;}

	my $total=$f+&GetPairFreq($coocfreq,$s,$t);
	if ($total<$minfreq){
	    $skipsrc{$s}+=$f;
	    $skiptrg{$t}+=$f;
	    next;
	}
	else{
#	    my $data=Uplug::Data::Tree->new;
	    my $data=Uplug::Data->new();
	    $data->setAttribute('source',$s);
	    $data->setAttribute('target',$t);
	    $data->setAttribute('freq',$f);      # DBM adds up frequencies!
	    $written++;
	    $coocfreq->write($data);
	}
    }
    close F;

    #----------------------------------------------------------------------
    if ($progress){
	$|=1;print STDERR "\nread remaining buffer\n";$|=0;
    }
    #----------------------------------------------------------------------

    my %freq=();
    my $count=0;
    open F,"gzip -cd <$TmpBuffer.gz |";
    while (<F>){
	chomp;
	my ($s,$t,$f)=split(/\x00/);
	if (not defined $skipsrc{$s}){next;}
	if (not defined $skiptrg{$t}){next;}

	$count++;
	if ($progress){
	    if (not ($count % 5000)){
		$|=1;print STDERR '.';$|=0;
	    }
	    if (not ($count % 100000)){
		$|=1;print STDERR "$skipped skipped ($count)\n";$|=0;
	    }
	}

	my $total=&GetPairFreq($coocfreq,$s,$t);
	if (($total+$skipsrc{$s})<$minfreq){$skipped++;next;}
	if (($total+$skiptrg{$t})<$minfreq){$skipped++;next;}
	$freq{$s}{$t}++;
    }
    close F;

    #----------------------------------------------------------------------
    if ($progress){
	$|=1;print STDERR "\nwrite remaining buffer\n";$|=0;
    }
    #----------------------------------------------------------------------

    my $count=0;
    my $written=0;
    foreach my $s (keys %freq){
	foreach my $t (keys %{$freq{$s}}){
	    $count++;
	    if ($progress){
		if (not ($count % 5000)){
		    $|=1;print STDERR '.';$|=0;
		}
		if (not ($count % 100000)){
		    $|=1;print STDERR "$written saved ($count)\n";$|=0;
		}
	    }
	    my $total=$freq{$s}{$t}+&GetPairFreq($coocfreq,$s,$t);
	    if ($total<$minfreq){next;}
#	    my $data=Uplug::Data::Tree->new;
	    my $data=Uplug::Data->new();
	    $data->setAttribute('source',$s);
	    $data->setAttribute('target',$t);
	    $data->setAttribute('freq',$freq{$s}{$t});
	    $coocfreq->write($data);
	    $written++;
	}
    }
}


END{
    if (-e $TmpBuffer){
	unlink $TmpBuffer;
    }
}



sub GetPairFreq{
    my ($db,$src,$trg)=@_;
#    my $sel=Uplug::Data::Tree->new;
    my $sel=Uplug::Data->new();
    my %pattern=('source' => $src,
		 'target' => $trg);
    if ($db->select($sel,\%pattern)){
	return $sel->attribute('freq');
    }
    return undef;
}

sub GetTokFreq{
    my ($db,$tok)=@_;
#    my $sel=Uplug::Data::Tree->new;
    my $sel=Uplug::Data->new();
    my %pattern=('token' => $tok);
    if ($db->select($sel,\%pattern)){
	return $sel->attribute('freq');
    }
    return undef;
}


#---------------------------------------------------
# WriteFreq: save token frequencies

sub WriteFreq{
    my ($stream,$TokFreq,$MinFreq,$PrintProgr)=@_;
    my $src;
    if ($PrintProgr){
	print STDERR "write frequencies\n";
    }
    foreach (keys %{$TokFreq}){


	my $total=$$TokFreq{$_};
	my $freq=$total;
	if ($First{$stream}){
	    if ($total<$MinFreq){                   # if freq < MinPairFreq
#		my $sel=Uplug::Data::Tree->new;     #   query the database
		my $sel=Uplug::Data->new();
		my %pattern=('token' => $_);        #   and get the total freq
		if ($stream->select($sel,\%pattern)){
		    $total+=$sel->attribute('freq');
		}
	    }
	}
	if ($total<$MinFreq){next;}

#	my $data=Uplug::Data::Tree->new;
	my $data=Uplug::Data->new();
	$data->setData({token => $_,
			freq => $freq});
#	$data->setAttribute('token',$_);
#	$data->setAttribute('freq',$freq);
	$stream->write($data);
	delete $$TokFreq{$_};
    }
    $First{$stream}=1;
}



sub makeRelPosFeature{
    my ($src,$trg)=@_;
    if ($src=~/pos\((\-?[0-9]+)\)/){
	my $srcPos=$1;
	$src=~s/pos\((\-?[0-9]+)\)/x/;
	if ($trg=~/pos\((\-?[0-9]+)\)/){
	    my $relPos=$1-$srcPos;
	    $trg=~s/pos\((\-?[0-9]+)\)/$relPos/;
	}
    }
    return ($src,$trg);
}


#---------------------------------------------------------------------------

sub GetDefaultIni{

    my $DefaultIni = 
{
  'module' => {
    'program' => 'coocfreq.pl',
    'location' => '$UplugBin',
    'name' => 'co-occurrence frequency counter',
#    'stdin' => 'bitext',
  },
  'description' => 'This modules counts co-occurrence frequencies of
  words and phrases.',
  'input' => {
    'bitext' => {
	'format' => 'xces align',
    },
  },
  'output' => {
    'cooc freq' => {
      'stream name' => 'cooc freq',
    },
    'source freq' => {
      'stream name' => 'source freq',
    },
    'target freq' => {
      'stream name' => 'target freq',
    }
  },
  'parameter' => {
    'token' => {
      'chunks (source)' => 'c.*',            # use marked chunks
      'chunks (target)' => 'c.*',            # use marked chunks
#      'minimal length diff' => 0.1,
#      'matching word class' => 'same',       # don't mix content and stop words
      'minimal frequency' => 2,
      'minimal frequency (source)' => 2,
      'minimal frequency (target)' => 2,
#      'minimal length (source)' => 2,
#      'minimal length (target)' => 2,
      'maximal ngram length (source)' => 1,  # >1 --> use N-grams
      'maximal ngram length (target)' => 1,  # >1 --> use N-grams
#      'use attribute (source)' => 'stem',
#      'use attribute (target)' => 'stem',
#      'grep token (source)' => 'alphabetic',
#      'grep token (target)' => 'alphabetic',
      'lower case (source)' => 1,
      'lower case (target)' => 1,
      'exclude stop words (source)' => 0,
      'exclude stop words (target)' => 0,
#      'language (source)' => 'english',
#      'language (target)' => 'swedish',
      'language (source)' => 'default',
      'language (target)' => 'default',
      'delimiter' => '\\s+',
      'token label' => 'w',
      'remove linked' => 1,
    },
    'runtime' => {
      'print progress' => 1,       # verbose output
      'buffer' => 2000000,         # number of token pairs buffered in a hash
      'source buffer' => 2000000,  # source token buffer
      'target buffer' => 2000000,  # target token buffer
      #------------------------------------------------------------
      # clean buffer: 
      # if set to 1: remove low-frequency-pairs from the buffer in
      #              cases of buffer overflows
      'clean buffer' => 1,
      #------------------------------------------------------------
    },
  },
  'arguments' => {
    'shortcuts' => {
       'in' => 'input:bitext:file',
       'infile' => 'input:bitext:file',
       'informat' => 'input:bitext:format',
       'src' => 'output:source freq:file',
       'trg' => 'output:target freq:file',
       'cooc' => 'output:cooc freq:file',
       'freq' => 'parameter:token:minimal frequency',
       'srclang' => 'parameter:token:language (source)',
       'trglang' => 'parameter:token:language (target)',
       'max' => 'parameter:runtime:max nr of segments',
       'buf' => 'parameter:runtime:buffer',
       'clean' => 'parameter:runtime:clean buffer',
    }
  },
  'widgets' => {
  }
};

    return %{$DefaultIni};
}