The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#-*-perl-*-
#
# uplug_malt: a simple UPLUG wrapper for MaltParser
#
#---------------------------------------------------------------------------
# Copyright (C) 2004-2011 Jörg Tiedemann  <jorg.tiedemann@lingfil.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
# $Id: tag.pl,v 1.3 2008/04/03 11:16:21 joerg72 Exp $
#----------------------------------------------------------------------------
# 

use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";

use Uplug::Data;
use Uplug::IO::Any;
use Uplug::Config;
use Uplug::Encoding;
use Encode;
use Cwd;

my %IniData=&GetDefaultIni;
my $IniFile='tag.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------

my ($InputStreamName,$InputStream)=
    each %{$IniData{'input'}};               # the first input stream;
my ($OutputStreamName,$OutputStream)=         # take only
    each %{$IniData{'output'}};               # the first output stream

my $input=Uplug::IO::Any->new($InputStream);
my $output=Uplug::IO::Any->new($OutputStream);

#---------------------------------------------------------------------------


my $UplugHome = &shared_home;
my $MaltHome  = $UplugHome.'/ext/parser/malt';
my $MaltModel = $IniData{parameter}{parser}{model} || "engmalt.linear";
my $JavaCmd   = $IniData{parameter}{java} || "java -Xmx2048m";


my $TmpUnparsed=Uplug::IO::Any::GetTempFileName;
my $TmpParsed=Uplug::IO::Any::GetTempFileName;

#---------------------------------------------------------------------------

my $data=Uplug::Data->new;

print STDERR "uplug_malt: create temporary text file!\n";

$input->open('read',$InputStream);
my $UplugEncoding=$input->getInternalEncoding();
my $OutEncoding=$IniData{parameter}{output}{encoding};
if (not defined $OutEncoding){$OutEncoding=$UplugEncoding;}
my $InEncoding=$IniData{parameter}{input}{encoding};
if (not defined $InEncoding){$InEncoding=$OutEncoding;}


## read data from input stream and convert to input format needed 
## by the external tagger

open F,">$TmpUnparsed";
binmode(F,":encoding($OutEncoding)");
while ($input->read($data)){
    my @nodes=$data->contentElements;
    my $count = 0;
    foreach my $n (@nodes){
	$count++;
	my $attr = $n->attributes();
	my $pos = 
	    exists $$attr{hun} ? $$attr{hun} :
	    exists $$attr{melt} ? $$attr{melt} :
	    exists $$attr{svmtool} ? $$attr{svmtool} : 
	    exists $$attr{pos} ? $$attr{pos} : '-';
	my $word = $n->content();

	# ID will be used for deprel arcs
	my $id = exists $$attr{id} ? $$attr{id} : $count;
	$id =~s/^.*\.([0-9]+)$/$1/;

	# TODO: are these subst necessary?
	$word=~s/^\t*([^\t]+)\t*/$1/;
	$word=~s/^\s*//;
	$word=~s/\s*$//;
	$pos=~s/^([^\ ]+) .*$/$1/;

	## handle malformed data by converting to octets and back
	## the sub in encode ensures that malformed characters are ignored!
	## (see http://perldoc.perl.org/Encode.html#Handling-Malformed-Data)
	if ($OutEncoding ne $UplugEncoding){
	    my $octets = encode($OutEncoding, $word,sub{ return '' });
	    $word = decode($OutEncoding, $octets);
	}
	print F "$id\t$word\t\_\t$pos\t$pos\t\_\t\_\t\_\t\_\t\_\n";
    }
    print F "\n";
}

close F;
$input->close;


# my $currentDir = getcwd();
# chdir $MaltHome;

my $command = $JavaCmd.' -jar malt.jar -c '.$MaltModel." -i $TmpUnparsed -o $TmpParsed -m parse";

print STDERR "uplug_malt: call MaltParser!\n";
print STDERR "            cd MaltHome;$command\n";

if (my $sig=system "cd $MaltHome;$command;"){
    die "# uplug_malt: Got signal $? from MaltParser! ($@)\n$command\n";
}

print STDERR "uplug_malt: read parsed date and create output file!\n";

$input->open('read',$InputStream);
$output->open('write',$OutputStream);
open F,"<$TmpParsed";
binmode(F,":encoding($OutEncoding)");
my $data=Uplug::Data->new;    # use a new data-object (new XML parser!)
my $ret;
while ($ret=$input->read($data)){
    my @nodes=$data->contentElements;

    my @deps=();

    while(<F>){
	last if(/^\s*$/);

	if (/^[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t(\d*)\t([^\t]+)\t[^\t]+\t[^\t]+\s*$/){
	    push(@deps,($1,$2));
	}
    }
    if (@deps != 2 * @nodes){
	print STDERR "number of tokens does not match number of dep-rels!\n";
	## TODO: try to match words .....?!
    }

    foreach my $n (@nodes){
	$data->setAttribute($n,'head',shift(@deps));
	$data->setAttribute($n,'deprel',shift(@deps));
    }
    $output->write($data);
}

close F;
$input->close;
$output->close;




END {
    unlink $TmpUnparsed;
    unlink $TmpParsed;
}

############################################################################

sub GetDefaultIni{

    my $DefaultIni = 
{
  'input' => {
    'text' => {
      'format' => 'xml',
      'root' => 's',
    }
  },
  'output' => {
    'text' => {
      'format' => 'xml',
      'root' => 's',
      'write_mode' => 'overwrite',
#     'encoding' => 'iso-8859-1',
      'status' => 'malt',
    }
  },
  'required' => {
    'text' => {
      'words' => undef
    }
  },
  'module' => {
    'program' => 'uplug_malt',
    'location' => '$UplugBin',
    'name' => 'MaltParser (English)',
    'stdout' => 'text'
  },
  'arguments' => {
    'shortcuts' => {
      'in' => 'input:text:file',
      'out' => 'output:text:file',
      'm' => 'parameter:parser:model',
      'j' => 'parameter:java',
    }
  },
};

return %{$DefaultIni};
}