The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Lingua::Align::Corpus::Treebank::Penn;

use 5.005;
use strict;

use Lingua::Align::Corpus::Treebank;
use File::Basename;

use vars qw(@ISA);
@ISA = qw(Lingua::Align::Corpus::Treebank);

sub new{
    my $class=shift;
    my %attr=@_;

    my $self={};
    bless $self,$class;

    foreach (keys %attr){
	$self->{$_}=$attr{$_};
    }

    return $self;
}

sub next_sentence_id{
    my $self=shift;

    if ($self->{NO_SENTID_FILE}){
	$self->{SENTCOUNT}++;
	return $self->{SENTCOUNT};
    }

    my $file=shift || $self->{-id_file};
    if (! defined $self->{FH}->{$file}){
	if (not defined $file){
	    my $base=basename($self->{-file});
	    my $dir=dirname($self->{-file});
	    $base=~s/\..*$/.ids/;
	    if (-e $dir.'/text/'.$base){
		$file=$dir.'/text/'.$base;
	    }
	    elsif (-e $dir.'/text/'.$base.'.gz'){
		$file=$dir.'/text/'.$base.'.gz';
	    }
	    else{
		$self->{NO_SENTID_FILE}=1;
	    }
	    $self->{-id_file}=$file;
	    return $self->next_sentence_id();
	}
	else{
	    $self->{FH}->{$file} = $self->open_file($file);
	}
    }

    my $fh=$self->{FH}->{$file};
    my $id = <$fh>;
    chomp $id;

    if ($id=~/^(.*)\-([^-]+)$/){
	return $2;
    }
    return $id;
}


sub read_next_sentence{
    my $self=shift;
    my $tree=shift;
    %{$tree}=();

    my $file=shift || $self->{-file};
    if (! defined $self->{FH}->{$file}){
	$self->{FH}->{$file} = $self->open_file($file);
	$self->{-file}=$file;
    }
    my $fh=$self->{FH}->{$file};

    $self->__initialize_parser($tree);
    $tree->{ID}=$self->next_sentence_id();

    # look for a possible tree start
    while(<$fh>){
	next if (/Sentence skipped: /);             # stanford parser: skips
	if (/SENTENCE_SKIPPED_OR_UNPARSABLE/){      # some sentences ...
	    $self->__initialize_parser($tree);      # -> return next sentences!
	    $tree->{ID}=$self->next_sentence_id();  # (with next ID)
	    next;
#	    return $self->read_next_sentence($tree,$file,@_); # return next!!!
	}
	last if (/^\s*\(/)
    }

    # parse until first empty line
    if (defined $_){
	do {
	    if (/\S/){
		return 1 if ($self->__parse($_,$tree));
	    }
#	    $self->__parse($_,$tree);
	    $_=<$fh>;
	}
	until ($_!~/\S/);
	return 1;
    }
    return 0;

#    while (<$fh>){
#	chomp;
#	next if ($_!~/\S/);
#	return 1 if ($self->__parse($_,$tree));
#    }
#    return 0;

}

sub __initialize_parser{
    my $self=shift;
    $self->{OPEN_BRACKETS}=0;
    $self->{NODECOUNT}=0;
    delete $self->{CURRENTNODE};
}

sub __parse{
    my $self=shift;
    my ($string,$tree)=@_;

    while ($string=~/\S/){

	# terminal node!
	if ($string=~/^\s*\((\S+)\s+(\S+?)\)(.*)$/){
	    my ($pos,$word)=($1,$2);
	    if ($word eq '-RRB-'){$word=')';}        # brackets
	    elsif ($word eq '-LRB-'){$word='(';}
	    $string=$3;

	    $self->{NODECOUNT}++;
#	    my $node = 500+$self->{NODECOUNT};
	    my $node = $self->{NODECOUNT};
	    if ($pos=~s/\-(s?[0-9]+\_[0-9]+)\-([0-9]+)$//){
		$node = $1;
	    }
	    elsif ($pos=~s/\-([0-9]+)$//){
		$node=$tree->{ID}.'_'.$1;
	    }
	    else{
		$node=$tree->{ID}.'_'.$node;
	    }
	    $tree->{NODES}->{$node}->{id} = $node;
	    $tree->{NODES}->{$node}->{pos} = $pos;
	    $tree->{NODES}->{$node}->{word} = $word;
	    push(@{$tree->{TERMINALS}},$node);

	    my $parent = $self->{CURRENTNODE};
	    push(@{$tree->{NODES}->{$node}->{PARENTS}},$parent);
	    push(@{$tree->{NODES}->{$parent}->{CHILDREN}},$node);
#	    push(@{$tree->{NODES}->{$node}->{RELATION}},'--');
	    push(@{$tree->{NODES}->{$parent}->{RELATION}},'--');

	}

	elsif ($string=~/^\s*\((\S+)(.*)$/){
	    my $cat=$1;
	    $string=$2;

	    $self->{NODECOUNT}++;
	    $self->{OPEN_BRACKETS}++;
#	    my $node = $self->{NODECOUNT};
	    my $node = 500+$self->{NODECOUNT};

	    if ($cat=~s/\-(s?[0-9]+\_[0-9]+)\-([0-9]+)$//){
		$node = $1;
	    }
	    elsif ($cat=~s/\-([0-9]+)$//){
		$node=$tree->{ID}.'_'.$1;
	    }
	    else{
		$node=$tree->{ID}.'_'.$node;
	    }

	    $tree->{NODES}->{$node}->{cat} = $cat;
	    $tree->{NODES}->{$node}->{id} = $node;

	    if ($self->{NODECOUNT} == 1){
		$tree->{ROOTNODE} = $node;
	    }
	    else{
		my $parent = $self->{CURRENTNODE};
		push(@{$tree->{NODES}->{$node}->{PARENTS}},$parent);
		push(@{$tree->{NODES}->{$parent}->{CHILDREN}},$node);
#		push(@{$tree->{NODES}->{$node}->{RELATION}},'--');
		push(@{$tree->{NODES}->{$parent}->{RELATION}},'--');
	    }

	    $self->{CURRENTNODE} = $node;

	}

	elsif ($string=~/^\s*\)(.*)$/){
	    $string=$1;
	    my $node = $self->{CURRENTNODE};
	    my $parent = $tree->{NODES}->{$node}->{PARENTS}->[0];
	    $self->{CURRENTNODE}=$parent;
	    $self->{OPEN_BRACKETS}--;
	}

	# something is wrong!
	elsif ($string=~/\S/){
	    return 0;
	}

    }

    return 1 if ($self->{OPEN_BRACKETS} == 0);
    return 0;

}



sub print_tree{
    my $self=shift;
    my $tree=shift;

    my $ids=shift || [];
    my $node = shift || $tree->{ROOTNODE};

    my $string.='(';
    if (defined $tree->{NODES}->{$node}->{cat}){
	$string.=$tree->{NODES}->{$node}->{cat};
    }
    elsif (defined $tree->{NODES}->{$node}->{pos}){
	$string.=$tree->{NODES}->{$node}->{pos};
    }
    elsif (defined $tree->{NODES}->{$node}->{rel}){
	$string.=$tree->{NODES}->{$node}->{rel};
    }
    # add node ID if necessary (for Dublin aligner format)
    if ($self->{-add_ids}){
#	if (not $self->{-skip_node_ids}){
	if ($self->{-add_node_ids}){
	    $string.='-'.$tree->{NODES}->{$node}->{id};
	}
	my $idx = scalar @{$ids} + 1;
	$string.='-'.$idx;
	$tree->{NODES}->{$node}->{idx}=$idx;
    }
    push (@{$ids},$tree->{NODES}->{$node}->{id});
    $string.=' ';

    if (exists $tree->{NODES}->{$node}->{CHILDREN}){
	foreach my $c (@{$tree->{NODES}->{$node}->{CHILDREN}}){
	    $string.=$self->print_tree($tree,$ids,$c);
	}
    }

    elsif (defined $tree->{NODES}->{$node}->{word}){
	$string.=$tree->{NODES}->{$node}->{word};
    }
    elsif (defined $tree->{NODES}->{$node}->{index}){
#	my $child = $tree->{NODES}->{$node}->{CHILDREN2}->[0];
	$string.='index-'.$tree->{NODES}->{$node}->{CHILDREN2}->[0];
    }
    $string.=')';
    return $string;
}

    


1;
__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

Lingua::Align::Corpus::Treebank::Penn - Read the Penn Treebank format

=head1 SYNOPSIS

=head1 DESCRIPTION

=head2 EXPORT

=head1 SEE ALSO

=head1 AUTHOR

Joerg Tiedemann

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.


=cut