The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#-*-perl-*-
#
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
#
# chunk.pl: a simple UPLUG wrapper for a "chunk-tagger"
#
# usage: chunk.pl <infile >outfile
#        chunk.pl [-i config] [-in in] [-out out] [-l language] [-s system]
#
# config      : configuration file
# in          : input file (source language)
# out         : output file
# l           : language (requires a startup script in './chunker/')
# system      : Uplug system (subdirectory of UPLUGSYSTEM)
#
#
# $Author$
# $Id$
#----------------------------------------------------------------------------
#
#            * requires a startup script for an external chunk
#              in the directory 'chunker/' 
#              (relative to the UPLUG home directory)
#            * default startup-script: chunker_$language
#            * default language: english
#            * default POS attribute: pos
#            * default input format for the chunker:
#                   1 sentence per line, 
#                   each token separated by <SPACE>,
#                   each token is tagged with POS tags
#            * default chunker output:
#                   1 sentence per line, chunk-tags are appended to each token
#                   (token1/pos1/tag1 token2/pos2/tag2 token3/pos3/tag3 ...)
#            * default chunk-tag-name: 'chunk'
#
# 

use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";

use Uplug::Data;
use Uplug::IO::Any;
use Uplug::Config;
use Encode;

my $UplugHome="$Bin/../";
$ENV{UPLUGHOME}=$UplugHome;

my %IniData=&GetDefaultIni;
my $IniFile='chunk.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------

my ($InputStreamName,$InputStream)=
    each %{$IniData{'input'}};               # the first input stream;
my ($OutputStreamName,$OutputStream)=         # take only
    each %{$IniData{'output'}};               # the first output stream

my $input=Uplug::IO::Any->new($InputStream);
my $output=Uplug::IO::Any->new($OutputStream);

#---------------------------------------------------------------------------

my $lang=$IniData{parameter}{chunker}{language};
my $prg=$IniData{parameter}{chunker}{'startup base'};
my $POSattr=$IniData{parameter}{input}{'POS attribute'};
my $InTokDel=$IniData{parameter}{input}{'token delimiter'};
my $OutTokDel=$IniData{parameter}{output}{'token delimiter'};
my $InSentDel=$IniData{parameter}{input}{'sentence delimiter'};
my $OutSentDel=$IniData{parameter}{output}{'sentence delimiter'};
my $InTagDel=$IniData{parameter}{input}{'POS tag delimiter'};
my $OutTagDel=$IniData{parameter}{output}{'POS tag delimiter'};
my $ChunkTagDel=$IniData{parameter}{output}{'chunk tag delimiter'};
my $ChunkTag=$IniData{parameter}{output}{'chunk tag'};
my %TokReplace=();
if (ref($IniData{parameter}{'input token replacements'}) eq 'HASH'){
    %TokReplace=%{$IniData{parameter}{'input token replacements'}};
}
my %TagReplace=();
if (ref($IniData{parameter}{'input tag replacements'}) eq 'HASH'){
    %TagReplace=%{$IniData{parameter}{'input tag replacements'}};
}
my %InputReplace=();
if (ref($IniData{parameter}{'input replacements'}) eq 'HASH'){
    %InputReplace=%{$IniData{parameter}{'input replacements'}};
}
my %OutputReplace=();
if (ref($IniData{parameter}{'output replacements'}) eq 'HASH'){
    %OutputReplace=%{$IniData{parameter}{'output replacements'}};
}

#---------------------------------------------------------------------------

my $UplugHome = &shared_home;

my $ChunkerDir=$UplugHome.'/ext/chunker/';
my $TmpUntagged=Uplug::IO::Any::GetTempFileName;
my $TmpTagged=Uplug::IO::Any::GetTempFileName;

my $ChunkerPrg=$ChunkerDir.$prg.$lang;

#---------------------------------------------------------------------------

my $data=Uplug::Data->new;

$input->open('read',$InputStream);
my $UplugEncoding=$input->getInternalEncoding();

my $InEncoding=$IniData{parameter}{input}{encoding};
my $OutEncoding=$IniData{parameter}{output}{encoding};
my $ModelEncoding=$IniData{parameter}{chunker}{encoding};

if (not defined $ModelEncoding){$ModelEncoding=$UplugEncoding;}
if (not defined $InEncoding){$InEncoding=$ModelEncoding;}
if (not defined $OutEncoding){$OutEncoding=$InEncoding;}

open F,">$TmpUntagged";

while ($input->read($data)){

    my @tok   = ();
    my @nodes = $data->contentNodesEncoded($InEncoding,\@tok);
    my @attr  = $data->attribute(\@nodes);

    map(s/^\s*//,@tok);                    # remove initial white-spaces
    map(s/\s*$//,@tok);                    # remove final white-spaces

    map($tok[$_]=&FixChunkerData($tok[$_],\%TokReplace),0..$#tok);
    foreach (0..$#tok){
	if ($tok[$_]!~/\S/){next;}
	if (defined $attr[$_]{$POSattr}){
	    $attr[$_]{$POSattr}=&FixChunkerData($attr[$_]{$POSattr},
						\%TagReplace);
	    $tok[$_].=$InTagDel.$attr[$_]{$POSattr};
	}
    }
    map($tok[$_]=&FixChunkerData($tok[$_],\%InputReplace),0..$#tok);

    # print them if any left
    if (@tok){
	print F join $InTokDel,@tok;
	print F $InSentDel;
    }
}

close F;
$input->close;

#---------------------------------------------------------------------------

if (my $sig=system "$ChunkerPrg $TmpUntagged >$TmpTagged"){
    die "# chunk: Got signal $? from $ChunkerPrg!\n";
}

#---------------------------------------------------------------------------

my $InputSeperator=$/;
$/=$OutSentDel;

$input->open('read',$InputStream);
$output->open('write',$OutputStream);
open F,"<$TmpTagged";

# $data->initXmlParser();

my $data=Uplug::Data->new;    # use a new data-object (new XML parser!)
while ($input->read($data)){
    my $tagged=undef;

    my @cont = $data->contentNodesEncoded($InEncoding);
    if (not @cont){$output->write($data);next;}

    my @tok=();
    $/=$OutSentDel;
    $tagged=<F>;
    $tagged=&FixChunkerData($tagged,\%OutputReplace);
    chomp $tagged;
    @tok=split(/$OutTokDel/,$tagged);

    $/=$InputSeperator;
    my @label=@tok;
    map(s/^.*$InTagDel//,@label);
    map(s/^(.*)$InTagDel.*$/$1/,@tok);
    my @tag=@tok;
    map(s/^.*$ChunkTagDel//,@tag);
    map(s/^(.*)$ChunkTagDel.*$/$1/,@tok);

    my $id=$data->attribute('id');
    $id=~s/^./c/;

    &AddChunks($data,\@cont,\@tok,\@tag,\@label,$id);
    $output->write($data);
}
close F;
$input->close;
$output->close;

$/=$InputSeperator;

END {
    unlink $TmpUntagged;
    unlink $TmpTagged;
}

############################################################################

sub AddChunks{
    my ($data,$nodes,$tokens,$tags,$labels,$id)=@_;

    my @words=();
    my @attr=();
    my $count=0;

    if (@{$nodes} != @{$tokens}){
	print STDERR "# chunk.pl - warning: ";
	print STDERR scalar @{$nodes}," tokens but ",scalar @{$tokens}," tags!!\n";
	return 0;
    }

    my ($type,$length);
    while (($type,$length)=&GetNextChunk($labels)){
	$count++;
	my @children=();
	foreach (0..$length){
	    $children[$_]=shift @{$nodes};
	}
	if (not $type){next;}              # no type --> no chunk!!
	my %ParentAttr=();
	$ParentAttr{type}=$type;
	$ParentAttr{id}=$id."-$count";
	$data->addParent(\@children,'chunk',\%ParentAttr);
    }
}

sub GetNextChunk{
    my ($labels)=@_;
    my $length=1;

    if (not @{$labels}){return ();}
    my $l=shift(@{$labels});

    my $PrevPos;
    my $PrevChunk=undef;
    my $length=0;

    if ($l=~/^(.)/){$PrevPos=$1;}
    if ($l=~/\-(.*)$/){$PrevChunk=$1;}

    while (@{$labels}){

	my $CurrentPos=undef;
	my $CurrentChunk=undef;
	if ($labels->[0]=~/^(.)/){$CurrentPos=$1;}
	if ($labels->[0]=~/\-(.*)$/){$CurrentChunk=$1;}

	if (($PrevPos eq 'O') or
	    ($CurrentPos=~/[B|O]/) or
	    ($PrevChunk ne $CurrentChunk)){
	    return ($PrevChunk,$length);
	}
	$length++;
	shift(@{$labels});
	$PrevPos=$CurrentPos;
	$PrevChunk=$CurrentChunk;
    }
    return ($PrevChunk,$length);
}



sub FixChunkerData{
    my ($string,$subst)=@_;
    foreach (keys %{$subst}){
	$string=~s/$_/$subst->{$_}/sg;
    }
    return $string;
}


sub GetDefaultIni{

    my $DefaultIni = 
{
  'input' => {
    'text' => {
      'format' => 'xml',
      'root' => 's',
    }
  },
  'output' => {
    'text' => {
      'format' => 'xml',
      'root' => 's',
      'write_mode' => 'overwrite',
#	'encoding' => 'iso-8859-1',
	'status' => 'chunk',
    }
  },
  'parameter' => {
    'input' => {
      'token delimiter' => ' ',
      'sentence delimiter' => '
',
      'POS tag delimiter' => '/',
      'POS attribute' => 'pos'
    },
    'chunker' => {
      'language' => 'english',
      'startup base' => 'chunker_'
    },
    'output' => {
      'token delimiter' => '\\s+',
      'chunk tag delimiter' => '\\/',
      'sentence delimiter' => '
',
      'chunk tag' => 'chunk',
      'POS tag delimiter' => '\\/',
	'encoding' => 'iso-8859-1',
    },
    'input token replacements' => {
      '\\,' => 'COMMA',
        ' ' => '_',
    },
    'input tag replacements' => {
      '\\,' => 'COMMA'
    }
  },
  'module' => {
    'program' => 'chunk.pl',
    'location' => '$UplugBin',
    'name' => 'chunker (english)',
    'stdout' => 'text'
  },
  'arguments' => {
    'shortcuts' => {
      'in' => 'input:text:file',
      'out' => 'output:text:file',
      'lang' => 'parameter:chunker:language',
      'in' => 'input:text:file',
      'pos' => 'parameter:input:POS attribute',
       'char' => 'output:text:encoding',
       'inchar' => 'input:text:encoding',
       'outchar' => 'output:text:encoding',
       'tag' => 'parameter:output:chunk tag',
    }
  },
  'widgets' => {
       'input' => {
	  'text' => {
	    'stream name' => 'stream(format=xml,status=tag,language=en)'
	  },
       },
       'parameter' => {
          'output' => {
	     'chunk tag' => 'optionmenu (chunk,c)',
	  },
          'input' => {
	     'POS attribute' => 'optionmenu (pos,grok,tnt)',
	  }
       }
  }
};

    return %{$DefaultIni};
}