The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
#!/usr/bin/env perl
#-*-perl-*-
#
# tag.pl: a simple UPLUG wrapper for a (POS) tagger
#
# usage: tag.pl <infile >outfile
#        tag.pl [-i config] [-in in] [-out out] [-l language] [-s system]
#
# config      : configuration file
# in          : input file (source language)
# out         : output file
# l           : language (requires a startup script in './tagger/')
# system      : Uplug system (subdirectory of UPLUGSYSTEM)
#
#
#---------------------------------------------------------------------------
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
# $Id$
#----------------------------------------------------------------------------
#
#            * requires a startup script for an external POS tagger
#              in the directory 'tagger/' (relative to UPLUG home directory)
#            * default startup-script: tagger_$language
#            * default language: swedish
#            * default input format for the tagger:
#                   1 sentence per line, each token separated by <SPACE>
#            * default tagger output:
#                   1 sentence per line, tags are appended to each token
#                   (token1/tag1 token2/tag2 token3/tag3 ...)
#            * default attribute name: pos
#
# 

use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";

use Uplug::Data;
use Uplug::IO::Any;
use Uplug::Config;
use Uplug::Encoding;
use Encode;

my %IniData=&GetDefaultIni;
my $IniFile='tag.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------

my ($InputStreamName,$InputStream)=
    each %{$IniData{'input'}};               # the first input stream;
my ($OutputStreamName,$OutputStream)=         # take only
    each %{$IniData{'output'}};               # the first output stream

my $input=Uplug::IO::Any->new($InputStream);
my $output=Uplug::IO::Any->new($OutputStream);

#---------------------------------------------------------------------------

# the tagger can either use a startup script in share/ext/tagger
my $lang=$IniData{parameter}{tagger}{language};
my $startup=$IniData{parameter}{tagger}{'startup base'};

# ... or a specific program and a language specific model
my $tagger=$IniData{parameter}{tagger}{program};

# additional tagger arguments
my $TaggerArgs=$IniData{parameter}{tagger}{parameter};
# tagging model (will be appended as additional argument)
my $TaggerModel=$IniData{parameter}{tagger}{model};

# set to stdin if the tagger needs piped data
my $TaggerInput=$IniData{parameter}{tagger}{input} || 'last argument';

my $attr=$IniData{parameter}{output}{attribute};
my $OutAttr=$IniData{parameter}{output}{attributes};
my $OutPattern=$IniData{parameter}{output}{pattern};
my $InTokDel=$IniData{parameter}{input}{'token delimiter'};
my $OutTokDel=$IniData{parameter}{output}{'token delimiter'};
my $InSentDel=$IniData{parameter}{input}{'sentence delimiter'};
my $OutSentDel=$IniData{parameter}{output}{'sentence delimiter'};
my $TagDel=$IniData{parameter}{output}{'tag delimiter'};
my %InputReplace=();
if (ref($IniData{parameter}{'input replacements'}) eq 'HASH'){
    %InputReplace=%{$IniData{parameter}{'input replacements'}};
}
my %OutputReplace=();
if (ref($IniData{parameter}{'output replacements'}) eq 'HASH'){
    %OutputReplace=%{$IniData{parameter}{'output replacements'}};
}
# my $OutputComments=$IniData{parameter}{output}{comments};

my @Attr=split(/:/,$OutAttr);
#---------------------------------------------------------------------------

my $TaggerPrg;
if (defined $startup){
    $TaggerPrg = &shared_home.'/ext/tagger/'.$startup;
    if (not -e $TaggerPrg){
	$TaggerPrg.=$lang;  # if necessary
    }	
}
elsif (defined $tagger){
    $TaggerPrg = &find_executable($tagger);
}
$TaggerPrg .= ' '.$TaggerArgs;
$TaggerPrg .= ' '.$TaggerModel;

my $TmpUntagged=Uplug::IO::Any::GetTempFileName;
my $TmpTagged=Uplug::IO::Any::GetTempFileName;

#---------------------------------------------------------------------------

my $data=Uplug::Data->new;

print STDERR "tag.pl: create temporary text file!\n";

$input->open('read',$InputStream);
my $UplugEncoding=$input->getInternalEncoding();

my $InEncoding=$IniData{parameter}{input}{encoding};
my $OutEncoding=$IniData{parameter}{output}{encoding};
my $ModelEncoding=$IniData{parameter}{tagger}{encoding};

if (not defined $ModelEncoding){$ModelEncoding=$UplugEncoding;}
if (not defined $InEncoding){$InEncoding=$ModelEncoding;}
if (not defined $OutEncoding){$OutEncoding=$InEncoding;}


## read data from input stream and convert to input format needed 
## by the external tagger

open F,">$TmpUntagged";
binmode(F,":encoding($InEncoding)");

while ($input->read($data)){
    my @tok = ();
    my @nodes = $data->contentNodesEncoded($InEncoding,\@tok);

    map(s/^\s*//,@tok);                    # remove initial white-spaces
    map(s/\s*$//,@tok);                    # remove final white-spaces

    map($tok[$_]=&FixTaggerData($tok[$_],\%InputReplace),0..$#tok);

    if (@tok){                             # print them if any left
	print F join $InTokDel,@tok;
	print F $InSentDel;
    }
}

close F;
$input->close;


#---------------------------------------------------------------------------
# the call to the external tagger

print STDERR "tag.pl: call external tagger!\n";
print STDERR "   $TaggerPrg $TmpUntagged >$TmpTagged\n";

my $command;
if ($TaggerInput =~/stdin/i){
    $command = "$TaggerPrg < $TmpUntagged >$TmpTagged";
}
else {
    $command = "$TaggerPrg $TmpUntagged >$TmpTagged";
}

if (my $sig=system($command)){
    die "# tag: Got signal $? from tagger $TaggerPrg!\n";
}
#---------------------------------------------------------------------------


## read the tagged data and the data from input stream again
## and add tags to the data

my $InputSeperator=$/;
print STDERR "tag.pl: read tagged file and create output data!\n";

$input->open('read',$InputStream);
$output->open('write',$OutputStream);
open F,"<$TmpTagged";

my $data=Uplug::Data->new;    # use a new data-object (new XML parser!)
my $ret;
while ($ret=$input->read($data)){

    my @cont = $data->contentNodesEncoded($InEncoding);

    ## no content nodes left --> just print the data
    if (not @cont){
	$output->write($data);
	$/=$InputSeperator;
	next;
    }

    ## read the tagged data and split into tokens
    $/=$OutSentDel;
    my $tagged=undef;
    my @tok=();
    $tagged=<F>;
    $tagged=&FixTaggerData($tagged,\%OutputReplace);
    chomp $tagged;
    @tok=split(/$OutTokDel/,$tagged);

    ## number nodes <> number tagged tokens --> do nothing
    if (@cont != @tok){
	print STDERR "# tag.pl - warning: ";
	print STDERR scalar @cont," tokens but ",scalar @tok," tags!!\n";
	$output->write($data);
	$/=$InputSeperator;
	next;
    }

    ## insert tags in XML nodes
    my $WordAttr;
    foreach my $j (0..$#Attr){
	if ($Attr[$j] eq 'word'){
	    $WordAttr=$j;
	}
    }
    if (@Attr and (defined $OutPattern)){
	foreach my $i (0..$#tok){
	    if (not ref($cont[$i])){next;}
	    if ($tok[$i]=~/$OutPattern/s){
		my @Val=($1,$2,$3,$4,$5,$6,$7,$8,$9);
		if ((@cont != @tok) and (defined $WordAttr)){
		    if ($data->content($cont[$i]) ne $Val[$WordAttr]){
			if (@cont>@tok){shift @cont;}
			else{shift @tok;}
			$i--;
			next;
		    }
		}
		foreach my $j (0..$#Attr){
		    if ($Attr[$j] eq 'word'){next;}
		    if ($Attr[$j] eq 'text'){next;}
		    if ($Val[$j]=~/\S/){
			$data->setAttribute($cont[$i],$Attr[$j],$Val[$j]);
		    }
		}
	    }
	}
    }
    else{
	my @tag=@tok;
	map(s/^.*$TagDel//,@tag);
	map(s/^(.*)$TagDel[^$TagDel].*$/$1/,@tok);
	foreach my $n (@cont){
	    $data->setAttribute($n,$attr,shift(@tag));
	    if (not @tag){last;}
	}
#	$data->setContentAttribute($attr,\@tag);
    }
    $output->write($data);
    $/=$InputSeperator;
}
close F;
$input->close;
$output->close;

$/=$InputSeperator;

END {
    unlink $TmpUntagged;
    unlink $TmpTagged;
}

############################################################################

sub FixTaggerData{
    my ($string,$subst)=@_;
    foreach (keys %{$subst}){
	$string=~s/$_/$subst->{$_}/sg;
    }
    return $string;
}


sub GetDefaultIni{

    my $DefaultIni = 
{
  'input' => {
    'text' => {
      'format' => 'xml',
      'root' => 's',
    }
  },
  'output' => {
    'text' => {
      'format' => 'xml',
      'root' => 's',
      'write_mode' => 'overwrite',
#	'encoding' => 'iso-8859-1',
	'status' => 'tagGrok',
    }
  },
  'required' => {
    'text' => {
      'words' => undef
    }
  },
  'parameter' => {
    'input' => {
      'token delimiter' => ' ',
      'sentence delimiter' => '
',
	'encoding' => 'iso-8859-1',
    },
    'output' => {
      'token delimiter' => '\\s+',
      'tag delimiter' => '\\/',
      'sentence delimiter' => '
',
      'attribute' => 'pos',
	'encoding' => 'iso-8859-1',
    },
    'tagger' => {
      'language' => 'english',
      'startup base' => 'tagger_'
    },
     'input replacements' => {
        ' ' => '_',
     },
  },
  'module' => {
    'program' => 'tag.pl',
    'location' => '$UplugBin',
    'name' => 'tagger (english)',
    'stdout' => 'text'
  },
  'arguments' => {
    'shortcuts' => {
      'in' => 'input:text:file',
      'out' => 'output:text:file',
      'lang' => 'parameter:tagger:language',
      'in' => 'input:text:file',
       'attr' => 'parameter:output:attribute',
       'char' => 'output:text:encoding',
       't' => 'parameter:tagger:startup base',
    }
  },
  'widgets' => {
       'input' => {
	  'text' => {
	    'stream name' => 'stream(format=xml,status=(tok|tag|chunk),language=en)'
	  },
       },
       'parameter' => {
          'output' => {
	     'attribute' => 'optionmenu (pos,tnt)',
	  }
       }
  }
};

return %{$DefaultIni};
}