The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#
# ngramfreq.pl: count n-gram frequencies
#
#---------------------------------------------------------------------------
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
# $Id$
#
# usage: 
#
#

use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";

use Uplug::IO::Any;
use Uplug::Data::Lang;
use Uplug::Data;
use Uplug::Config;

my %IniData=&GetDefaultIni;
my $IniFile='ngramfreq.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------

my ($InputStreamName,$InputStream)=             # input stream
    each %{$IniData{'input'}};
my ($OutputStreamName,$OutputStream)=             # output stream
    each %{$IniData{'output'}};

my $input=Uplug::IO::Any->new($InputStream);
my $ngramfreq=Uplug::IO::Any->new($OutputStream);

#---------------------------------------------------------------------------

$input->open('read',$InputStream);
$ngramfreq->open('write',$OutputStream);

#---------------------------------------------------------------------------

my $Param={};
$Param=$IniData{parameter};

my $MinFreq=$IniData{parameter}{token}{'minimal frequency'};
my $lang=$IniData{parameter}{token}{'language'};
my $ExclStop=$IniData{parameter}{token}{'exclude stop words'};

my $MinLen=$IniData{parameter}{token}{'minimal ngram length'};
my $MaxLen=$IniData{parameter}{token}{'maximal ngram length'};

my $PrintProgr=$IniData{'parameter'}{'runtime'}{'print progress'};
my $Buffer=$IniData{'parameter'}{'runtime'}{'buffer'};
my $MaxSegments=$IniData{'parameter'}{'runtime'}{'max nr of segments'};

my %First;

#---------------------------------------------------------------------------

if ($PrintProgr){
    print STDERR "read sentences\n";
}

my $count=0;
my $SegCount=0;
my %NgramFreq;
my %LenFreq;
my %LenTypeFreq;
my $TotalFreq;

my $data=Uplug::Data::Lang->new($lang);

my $time=time();
while ($input->read($data)){
    $count++;
    $SegCount++;

    if ($PrintProgr){
	if (not ($SegCount % 100)){
	    $|=1;
	    print STDERR "$SegCount segments (";
	    print STDERR time()-$time;
	    print STDERR " sec, $TotalFreq)\n";
	    $|=0;
	    if ($MaxSegments){
		if ($SegCount>$MaxSegments){last;}
	    }
	}
    }

    my @Nodes=();
    my @Ngrams=$data->getPhrases($$Param{token},\@Nodes);

    foreach my $t (0..$#Ngrams){
	if ($ExclStop and $data->isStopWord($Ngrams[$t])){next;}
	my $len=$#{$Nodes[$t]};
	if (($len==0) or (($len>$MinLen-3) and ($len<$MaxLen))){
	    if (not defined $NgramFreq{$Ngrams[$t]}){
		$TotalFreq++;
		$LenTypeFreq{$len+1}++;
	    }
	    $NgramFreq{$Ngrams[$t]}++;
	    $LenFreq{$len+1}++;
	}
	if ($Buffer and (not ($TotalFreq % $Buffer))){
	    &WriteFreq($ngramfreq,\%NgramFreq,$MinFreq,$PrintProgr);
	}
    }
}
$input->close;

#---------------------------------------------------------------------------

&WriteFreq($ngramfreq,\%NgramFreq,$MinFreq,$PrintProgr);

my %header=%{$$Param{token}};
$header{'ngram type freq'}=$TotalFreq;
foreach (keys %LenFreq){
    $header{"$_-gram freq"}=$LenFreq{$_};
    $header{"$_-gram type freq"}=$LenTypeFreq{$_};
}
$ngramfreq->addheader(\%header);
$ngramfreq->writeheader;
$ngramfreq->close;


#---------------------------------------------------------------------------

sub WriteFreq{
    my ($stream,$TokFreq,$MinFreq,$PrintProgr)=@_;
    my $src;
    if ($PrintProgr){
	print STDERR "write frequencies\n";
    }
    foreach (keys %{$TokFreq}){


	my $total=$$TokFreq{$_};
	my $freq=$total;
	if ($First{$stream}){
	    if ($total<$MinFreq){                   # if freq < MinPairFreq
		my $sel=Uplug::Data->new;         #   query the database
		my %pattern=('token' => $_);        #   and get the total freq
		if ($stream->select($sel,\%pattern)){
		    $total+=$sel->attribute('freq');
		}
	    }
	}
	if ($total<$MinFreq){next;}

	my $data=Uplug::Data->new;
	$data->setAttribute('token',$_);
	$data->setAttribute('freq',$freq);
	$stream->write($data);
	delete $$TokFreq{$_};
    }
    $First{$stream}=1;
}











#---------------------------------------------------------------------------

sub GetDefaultIni{

    my $DefaultIni = {
  'module' => {
    'name' => 'N-gram frequencies',
    'program' => 'ngramfreq.pl',
    'location' => '$UplugBin',
    'stin' => 'text',
    'stdout' => 'text',
  },
  'input' => {
    'text' => {
      'write_mode' => 'write',
      'format' => 'xml',
      'root' => 's',
    }
  },
  'output' => {
    'ngram freq' => {
      'format' => 'DBM',
      'key' => ['token'],
      'write_mode' => 'overwrite',
    },
  },
  'parameter' => {
    'token' => {
      'minimal frequency' => 2,
      'minimal length' => 1,
      'minimal ngram length' => 2,
      'maximal ngram length' => 3,
      'use attribute' => 'stem',
#      'grep token' => 'contains alphabetic',
      'lower case' => 1,
      'exclude stop words' => 0,
      'language' => 'default',
      'token label' => 'w',
    },
    'runtime' => {
      'print progress' => 1,
      'max nr of segments' => 0,
      'buffer' => 10000000,
    },
  },
  'arguments' => {
    'shortcuts' => {
       'in' => 'input:text:file',
       'infile' => 'input:text:file',
       'informat' => 'input:text:format',
       'max' => 'parameter:runtime:max nr of segments',
       'out' => 'output:ngram freq:file',
       'freq' => 'output:ngram freq:file',
       'ngram' => 'output:ngram freq:file',
       'lang' => 'parameter:token:language',
    }
  },
};
    return %{$DefaultIni};
}