The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Lingua::Align::Corpus::Treebank;

use 5.005;
use strict;

use vars qw(@ISA);
@ISA = qw(Lingua::Align::Corpus);

use Lingua::Align::Corpus;
use Lingua::Align::Corpus::Treebank::Penn;
use Lingua::Align::Corpus::Treebank::Berkeley;
use Lingua::Align::Corpus::Treebank::Stanford;
use Lingua::Align::Corpus::Treebank::TigerXML;
use Lingua::Align::Corpus::Treebank::AlpinoXML;


sub new{
    my $class=shift;
    my %attr=@_;

    if ($attr{-type}=~/tiger/i){
	return new Lingua::Align::Corpus::Treebank::TigerXML(%attr);
    }
    if ($attr{-type}=~/alpino/i){
	return new Lingua::Align::Corpus::Treebank::AlpinoXML(%attr);
    }
    if ($attr{-type}=~/stanford/i){
	return new Lingua::Align::Corpus::Treebank::Stanford(%attr);
    }
    if ($attr{-type}=~/berkeley/i){
	return new Lingua::Align::Corpus::Treebank::Berkeley(%attr);
    }
    return new Lingua::Align::Corpus::Treebank::Penn(%attr);
}


sub close{
    my $self=shift;
    my $file=shift || $self->{-file};
    $self->close_file($file);
}


# next sentence returns a tree for the next sentence
# (here: only virtual function ....)
sub read_next_sentence{}

sub next_sentence_id{}            # return next sentence ID and move to next
sub next_sentence_id_would_be{}   # return next sentence ID and stay at current


sub root_node{
    my $self=shift;
    my ($tree)=@_;
    if (exists $tree->{ROOTNODE}){
	return $tree->{ROOTNODE};
    }
    foreach (keys %{$tree->{NODES}}){
	if ((not exists $tree->{NODES}->{$_}->{PRENTS}) || 
	    (scalar @{$tree->{NODES}->{$_}->{PRENTS}} == 0)){
	    $tree->{ROOTNODE}=$_;
	    return $_;
	}
    }
    return undef;  # no root node? ---> no tree structure?
}

sub distance_to_root{
    my $self=shift;
    my ($tree,$node)=@_;
#    return 0 if (not defined $node);
#    return 0 if ($node eq '');
    if (exists $tree->{NODES}->{$node}->{TREELEVEL}){
	return $tree->{NODES}->{$node}->{TREELEVEL};
    }
    my $this=$node;
    my $count=0;
    while (exists $tree->{NODES}->{$this}->{PARENTS}){
	last if (scalar @{$tree->{NODES}->{$this}->{PARENTS}} == 0);
	$count++;
	$this=$tree->{NODES}->{$this}->{PARENTS}->[0];
	last if (not exists $tree->{NODES}->{$this});
    }
    $tree->{NODES}->{$node}->{TREELEVEL}=$count;
    return $count;
}

sub tree_size{
    my $self=shift;
    my $tree=shift;
    return $tree->{TREESIZE} if (exists $tree->{TREESIZE});
    my $size=0;
    foreach my $n (keys %{$tree->{NODES}}){
	my $level=$self->distance_to_root($tree,$n);
	if ($level>$size){$size=$level;}
    }
    $tree->{TREESIZE} = $size;   # cache tree size for later calls!
    return $size;
}

sub get_all_nodes{
    my $self=shift;
    my $tree=shift;
    return sort keys %{$tree->{NODES}};
}

sub get_all_leafs{
    my $self=shift;
    my ($tree,$attr)=@_;
    $attr = 'word' if (not defined $attr);
    my @words=();
    if (ref($tree->{TERMINALS}) eq 'ARRAY'){
	foreach my $n (@{$tree->{TERMINALS}}){
	    push(@words,$tree->{NODES}->{$n}->{$attr});
	}
    }
    return @words;
}

sub is_nonterminal{
    my $self=shift;
    my ($tree,$node)=@_;
    if (exists $tree->{NODES}){
	if (exists $tree->{NODES}->{$node}){
	    if (exists $tree->{NODES}->{$node}->{CHILDREN}){
		return 1;
	    }
	    if (exists $tree->{NODES}->{$node}->{CHILDREN2}){
		return 1;
	    }
	}
    }
    return 0;
}

sub is_terminal{
    my $self=shift;
    return not $self->is_nonterminal(@_);

    ## caching this information? not really necessary .... 
    # 
    # if (exists $_[0]->{NODES}){
    # 	if (exists $_[0]->{NODES}->{$_[1]}){
    # 	    if (exists $_[0]->{NODES}->{$_[1]}->{TERMINAL_NODE}){
    # 		return 1;
    # 	    }
    # 	}
    # }
    # if (not $self->is_nonterminal(@_)){
    # 	$_[0]->{NODES}->{$_[1]}->{TERMINAL_NODE} = 1;
    # 	return 1;
    # }
    # $_[0]->{NODES}->{$_[1]}->{NONTERMINAL_NODE} = 1;
    # return 0;
}

 sub is_descendent{
    my $self=shift;
    my ($tree,$desc,$anc)=@_;

    # look at ancestor relation cache first
    if (exists $tree->{__IS_ANCESTOR__}){
	if (exists $tree->{__IS_ANCESTOR__}->{$anc}){
	    if (exists $tree->{__IS_ANCESTOR__}->{$anc}->{$desc}){
		return $tree->{__IS_ANCESTOR__}->{$anc}->{$desc};
	    }
	}
    }

    my @parents=();
    if (exists $tree->{NODES}->{$desc}->{PARENTS}){
	@parents = @{$tree->{NODES}->{$desc}->{PARENTS}};
    }
    while (@parents){
	my $p=shift(@parents);
	if ($p eq $anc){
	    $tree->{__IS_ANCESTOR__}->{$anc}->{$desc}=1; # add relation to cache
	    return 1;
	}
	if ($self->is_descendent($tree,$p,$anc)){
	    $tree->{__IS_ANCESTOR__}->{$anc}->{$desc}=1;
	    $tree->{__IS_ANCESTOR__}->{$anc}->{$p}=1;
	    return 1;
	}
	$tree->{__IS_ANCESTOR__}->{$anc}->{$p}=0;
    }
    $tree->{__IS_ANCESTOR__}->{$anc}->{$desc}=0;
    return 0;
}


sub is_ancestor{
    my $self=shift;
    my ($tree,$anc,$desc)=@_;
    return $self->is_descendent($tree,$desc,$anc);
}


# get all parents for a given node in a given tree

sub parents{
    my $self=shift;
    my ($tree,$node)=@_;
    if (exists $tree->{NODES}->{$node}->{PARENTS}){
	return @{$tree->{NODES}->{$node}->{PARENTS}};
    }
    return ();
}

# get (first) parent

sub parent{
    my $self=shift;
    my ($tree,$node)=@_;
    if (exists $tree->{NODES}->{$node}->{PARENTS}){
	return $tree->{NODES}->{$node}->{PARENTS}->[0];
    }
    return undef;
}

# get all children

sub children{
    my $self=shift;
    my ($tree,$node)=@_;
    if (exists $tree->{NODES}->{$node}->{CHILDREN}){
	return @{$tree->{NODES}->{$node}->{CHILDREN}};
    }
    return ();
}

# return yield of subtree == get_leafs

sub yield{
  my $self=shift;
  return $self->get_leafs(@_);
}

# get all nodes in the subtree

sub subtree_nodes{
    my $self=shift;
    my ($tree,$node)=@_;
    my @subtree=();
    my @children=$self->children($tree,$node);
    foreach my $c (@children){
	push (@subtree,$c);
	push (@subtree,$self->subtree_nodes($tree,$c));
    }
    return @subtree;
}

# get all sister nodes

sub sisters{
    my $self=shift;
    my ($tree,$node)=@_;
    my @sisters=();
    if (exists $tree->{NODES}->{$node}->{PARENTS}){
	foreach my $p (@{$tree->{NODES}->{$node}->{PARENTS}}){
	    foreach my $s (@{$tree->{NODES}->{$p}->{CHILDREN}}){
		if ($node ne $s){
		    push(@sisters,$s);
		}
	    }
	}
    }
    return @sisters;
}


# get neighbor nodes
# $pos gives the distance to the current node
# $pos > 0 ---> right neighbors
# $pos < 0 ---> left neighbors

sub neighbor{
    my ($self,$tree,$node,$pos)=@_;

    # terminal node? --> easy!

    if ($self->is_terminal($tree,$node)){
	my ($start,$end) = $self->subtree_span($tree,$node);
	my $n = $start + $pos;
	
	if ($#{$tree->{TERMINALS}} >= $n-1){
	    return $tree->{TERMINALS}->[$n-1];
	}
    }

    # non-terminals: get left/right neighbors iteratively
    # (this only gives sister nodes dominated by the same parent)
    # (should we also try to move into neighboring sub-trees?)

    else{
	for (0..$pos){

	    if ($pos>0){ $node = $self->right_neighbor($tree,$node); }
	    else {       $node = $self->left_neighbor($tree,$node); }

	    if (! $node){ return undef; }
	    return $node;
	}
    }

    return undef;
}




sub left_neighbor{
    my ($self,$tree,$node)=@_;

    if (exists $tree->{NODES}->{$node}->{LEFTNEIGHBOR}){
	return $tree->{NODES}->{$node}->{LEFTNEIGHBOR};
    }

    my ($parent) = $self->parent($tree,$node);
    if ($parent){
	my @children = $self->children($tree,$parent);
	my $left=undef;
	foreach my $c (@children){
	    if ($c eq $node){
		$tree->{NODES}->{$node}->{LEFTNEIGHBOR} = $left;
		return $left;
	    }
	    $left = $c;
	}
    }
    $tree->{NODES}->{$node}->{LEFTNEIGHBOR} = undef;
    return undef;
}


sub right_neighbor{
    my ($self,$tree,$node)=@_;

    if (exists $tree->{NODES}->{$node}->{RIGHTNEIGHBOR}){
	return $tree->{NODES}->{$node}->{RIGHTNEIGHBOR};
    }

    my ($parent) = $self->parent($tree,$node);
    if ($parent){
	my @children = $self->children($tree,$parent);
	foreach my $c (0..$#children-1){
	    if ($children[$c] eq $node){
		$tree->{NODES}->{$node}->{RIGHTNEIGHBOR} = $children[$c+1];
		return $children[$c+1];
	    }
	}
    }
    $tree->{NODES}->{$node}->{RIGHTNEIGHBOR} = undef;
    return undef;
}




sub is_unary_subtree{
    my $self=shift;
    my ($tree,$node,$child)=@_;
    if (exists $tree->{NODES}){
	if (exists $tree->{NODES}->{$node}){
	    if (exists $tree->{NODES}->{$node}->{CHILDREN}){
		if ($#{$tree->{NODES}->{$node}->{CHILDREN}} == 0){
		    $$child = $tree->{NODES}->{$node}->{CHILDREN}->[0];
		    return 1;
		}
	    }
	}
    }
    return 0;
}

sub get_outside_leafs{
    my $self=shift;
    my ($tree,$node,$attr)=@_;
    $attr = 'word' if (not defined $attr);

    ## check if subtree leafs with the specified attr are already stored
    if (exists($tree->{NODES}->{$node}->{OUTLEAFS})){
	if (exists($tree->{NODES}->{$node}->{OUTLEAFS}->{$attr})){
	    if (ref($tree->{NODES}->{$node}->{OUTLEAFS}->{$attr}) eq 'ARRAY'){
		return @{$tree->{NODES}->{$node}->{OUTLEAFS}->{$attr}};
	    }
	}
	## if we have IDs --> get the attribute from the nodes
	elsif (exists($tree->{NODES}->{$node}->{OUTLEAFS}->{id})){
	    if (ref($tree->{NODES}->{$node}->{OUTLEAFS}->{id}) eq 'ARRAY'){
		my @ids = @{$tree->{NODES}->{$node}->{OUTLEAFS}->{id}};
		my @val=();
		foreach my $i (@ids){
		    push (@val,$tree->{NODES}->{$i}->{$attr});
		}
		return @val;
	    }
	}
    }

    my @leafs=@{$tree->{TERMINALS}};
    my @ids = $self->get_leafs($tree,$node,'id');

    my %inside=();
    foreach (@ids){$inside{$_}=1;}

    my @outside=();
    foreach (@leafs){
	if (!exists($inside{$_})){
	    push(@outside,$tree->{NODES}->{$_}->{$attr});
	}
    }
    ## cache this
    @{$tree->{NODES}->{$node}->{OUTLEAFS}->{$attr}}=@outside;
    return @outside;
}



sub get_nr_leafs{
    my $self=shift;
    my ($tree,$node)=@_;

    if (exists $tree->{NODES}->{$node}){

	# leaf nodes --> just one alone ....
	return 1 if (! exists $tree->{NODES}->{$node}->{CHILDREN});

	# check cached value
	if (exists($tree->{NODES}->{$node}->{NR_LEAFS})){
	    return $tree->{NODES}->{$node}->{NR_LEAFS};
	}
	if (exists($tree->{NODES}->{$node}->{LEAFS})){
	    my ($key,$val)=each %{$tree->{NODES}->{$node}->{LEAFS}};
	    if (ref($tree->{NODES}->{$node}->{LEAFS}->{$key}) eq 'ARRAY'){
		$tree->{NODES}->{$node}->{NR_LEAFS} = 
		    scalar @{$tree->{NODES}->{$node}->{LEAFS}->{$key}};
		return $tree->{NODES}->{$node}->{NR_LEAFS};
	    }
	}
	my @leafs = $self->get_leafs($tree,$node);
	$tree->{NODES}->{$node}->{NR_LEAFS} = scalar @leafs;
	return $tree->{NODES}->{$node}->{NR_LEAFS};
    }
    return 0;
}



sub get_leafs{
    my $self=shift;
    my ($tree,$node,$attr)=@_;
    return () if (ref($tree) ne 'HASH');
    return () if (ref($tree->{NODES}) ne 'HASH');

    $attr = 'word' if (not defined $attr);

    if (exists $tree->{NODES}->{$node}){

	# I am a leaf! --> return my attr
	if (! exists $tree->{NODES}->{$node}->{CHILDREN}){
	    if (exists $tree->{NODES}->{$node}->{$attr}){
		return ($tree->{NODES}->{$node}->{$attr});
	    }
	    return ();
	}

	## check if subtree leafs with the specified attr 
	## are already stored in cache
	elsif (exists($tree->{NODES}->{$node}->{LEAFS})){
	    if (exists($tree->{NODES}->{$node}->{LEAFS}->{$attr})){
		if (ref($tree->{NODES}->{$node}->{LEAFS}->{$attr}) eq 'ARRAY'){
		    return @{$tree->{NODES}->{$node}->{LEAFS}->{$attr}};
		}
	    }
	    ## if we have IDs --> get the attribute from the nodes
	    elsif (exists($tree->{NODES}->{$node}->{LEAFS}->{id})){
		if (ref($tree->{NODES}->{$node}->{LEAFS}->{id}) eq 'ARRAY'){
		    my @ids = @{$tree->{NODES}->{$node}->{LEAFS}->{id}};
		    my @val=();
		    foreach my $i (@ids){
			push (@val,$tree->{NODES}->{$i}->{$attr});
		    }
		    @{$tree->{NODES}->{$node}->{LEAFS}->{$attr}} = @val;
		    return @val;
		}
	    }
	}

	if (ref($tree->{NODES}->{$node}->{CHILDREN}) eq 'ARRAY'){
	    my @leafs=();
	    foreach my $c (@{$tree->{NODES}->{$node}->{CHILDREN}}){
		push(@leafs,$self->get_leafs($tree,$c,$attr));
	    }
	    ## cache subtree leafs ....
	    @{$tree->{NODES}->{$node}->{LEAFS}->{$attr}} = @leafs;
	    $tree->{NODES}->{$node}->{NR_LEAFS} = scalar @leafs;
	    return @leafs;
	}

	print STDERR "WARNING: Something must be wrong here!\n";

	if (exists $tree->{NODES}->{$node}->{$attr}){
	    return ($tree->{NODES}->{$node}->{$attr});
	}
    }
    return ();
}


sub subtree_span{
    my $self=shift;
    my ($tree,$node)=@_;
    if (exists $tree->{NODES}->{$node}->{begin}){
	if (exists $tree->{NODES}->{$node}->{end}){
	    return ($tree->{NODES}->{$node}->{begin},
		    $tree->{NODES}->{$node}->{end});
	}
    }

    my @leafs = $self->get_leafs($tree,$node,'id');
    my %hash=();
    foreach (@leafs){$hash{$_}=1;}
    my $start=9999999;
    my $end=0;
    foreach (0..$#{$tree->{TERMINALS}}){
	if (exists $hash{$tree->{TERMINALS}->[$_]}){
	    if ($_<$start-1){$start = $_+1;}
	    if ($_>=$end){$end = $_+1;}
	}
    }
    if ($start<9999999 && $end>0){
	$tree->{NODES}->{$node}->{begin} = $start;
	$tree->{NODES}->{$node}->{end} = $end;
	return ($start,$end);
    }
    print STDERR "Strange? no start & end of the tree-span? ('$node' $start-$end)\n";
    return ();
}

sub print_sentence{
    my $self=shift;
    return $self->print_tree(@_);
}

sub print_tree{
    my $self=shift;
    my $tree=shift;
    return '';
}

1;
__END__


=head1 NAME

Lingua::Align::Corpus::Treebank - Factory class for reading treebanks

=head1 SYNOPSIS

    my $treebank = new Lingua::Align::Corpus::Treebank(-file => $corpusfile,
                                                       -type => 'TigerXML');

  my %tree=();
  while ($treebank->next_sentence(\%tree)){
    print $treebank->print_sentence(\%tree);
    print "\n";
  }

=head1 DESCRIPTION

Factory class of modules for reading treebanks in different formats. The default format is the Penn Treebank format. Other supported formats are the format produced by the Berkeley parser, the Stanford parser (including typed dependencies), TigerXML and Alpino XML.

=head2 EXPORT

=head1 SEE ALSO

L<Lingua::Align::Corpus>
L<Lingua::Align::Corpus::Treebank::Penn>
L<Lingua::Align::Corpus::Treebank::Berkeley>
L<Lingua::Align::Corpus::Treebank::Stanford>
L<Lingua::Align::Corpus::Treebank::TigerXML>
L<Lingua::Align::Corpus::Treebank::AlpinoXML>


=head1 AUTHOR

Joerg Tiedemann

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.


=cut