# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Before `make install' is performed this script should be runnable with
# `make test'. After `make install' it should work as `perl test.pl'

#########################

# change 'tests => 1' to 'tests => last_test_to_print';
use Test::More tests => 12;
use Lingua::EN::Tagger;

ok('Lingua::EN::Tagger', 'module compiled'); # Reaching this line means the `use` above succeeded.

#########################

# The assertions below use Test::More, which is loaded above; see
# `perldoc Test::More` for the available test functions.  Keep the plan
# at the top of the file in sync with the number of ok()/is() calls here.



######################################
# Start by creating the parser object
# (without the stemmer)
######################################
my $parser = Lingua::EN::Tagger->new(
    stem                => 0,
    weight_noun_phrases => 0,
    longest_noun_phrase => 15,
);
ok( $parser, 'creating parser object' );

# Tag the sample passage once; the extraction tests below reuse the result.
my $tagged = $parser->add_tags( penn() );

##############################################
# Test get_words() on the untagged passage
##############################################
my %words = $parser->get_words( penn() );
ok( scalar( keys %words ), 'get_words() method' );
my $accuracy = compute_accuracy( \%words, np_benchmark() );
# Described differently from the get_noun_phrases() check further down so a
# failure of either one can be told apart in the test output.
is( $accuracy, '100', "accuracy of get_words() extraction ($accuracy%)" );

##############################################
# Test the extraction of maximal noun phrases
##############################################
my %max_noun_phrases = $parser->get_max_noun_phrases( $tagged );
ok( scalar( keys %max_noun_phrases ), 'extract MNPs' );
$accuracy = compute_accuracy( \%max_noun_phrases, mnp_benchmark() );
is( $accuracy, '100', "accuracy of mnp extraction ($accuracy%)" );


##############################################
# Test the extraction of all noun phrases
##############################################
my %noun_phrases = $parser->get_noun_phrases( $tagged );
ok( scalar( keys %noun_phrases ), 'extract noun phrases' );
$accuracy = compute_accuracy( \%noun_phrases, np_benchmark() );
is( $accuracy, '100', "accuracy of np extraction ($accuracy%)" );

##############################################
# Test the extraction of all nouns
##############################################
my %nouns = $parser->get_nouns( $tagged );
ok( scalar( keys %nouns ), 'extract nouns' );
$accuracy = compute_accuracy( \%nouns, noun_benchmark() );
is( $accuracy, '100', "accuracy of noun extraction ($accuracy%)" );


##############################################
# Test the extraction of proper nouns
##############################################
my %nnp = $parser->get_proper_nouns( $tagged );
ok( scalar( keys %nnp ), 'extract proper nouns' );
$accuracy = compute_accuracy( \%nnp, nnp_benchmark() );
is( $accuracy, '100', "accuracy of nnp extraction ($accuracy%)" );


# Score how closely an extraction result matches a benchmark.
#
# Arguments (positional):
#   $hash_ref  - hash reference mapping each extracted term to its count
#   $benchmark - hash reference mapping each expected term to its count
#
# Three kinds of checks are tallied:
#   1. every extracted term must be present in the benchmark;
#   2. for terms present in both, the counts must be numerically equal;
#   3. every benchmark term must be present in the extraction.
# Each failed check counts as one error.
#
# Returns the percentage of passed checks as an integer string, truncated
# toward zero (e.g. '100' for a perfect match, '66' for two of three).
# When both hashes are empty there is nothing to check, so the extraction
# trivially matches the benchmark and '100' is returned.
sub compute_accuracy {
    my ( $hash_ref, $benchmark ) = @_;
    my $errors = 0;
    my $checks = 0;

    for my $term ( keys %{$hash_ref} ) {
        $checks++;
        unless ( defined $benchmark->{$term} ) {
            # Unknown term: one error, and no count comparison is possible.
            $errors++;
            next;
        }
        $checks++;
        $errors++ unless $hash_ref->{$term} == $benchmark->{$term};
    }

    for my $term ( keys %{$benchmark} ) {
        $checks++;
        $errors++ unless defined $hash_ref->{$term};
    }

    # Guard against dividing by zero when neither hash has any entries.
    return '100' if $checks == 0;

    # Multiply before dividing so the numerator is an exact integer; this
    # keeps the "%d" truncation from being thrown off by rounding error.
    return sprintf( "%d", 100 * ( $checks - $errors ) / $checks );
}

# Benchmark for maximal noun phrase extraction.
#
# Returns a reference to a fresh hash mapping each expected phrase
# (lower-cased) to the count 1.  The test script compares this against the
# output of get_max_noun_phrases() for the penn() passage.
#
# The hash is held in a lexical so that calling this sub does not overwrite
# any package variable.
sub mnp_benchmark {
    my @phrases = (
        'lisa raines',
        'lawyer',
        'director of government relations for the industrial biotechnical association',
        'judge',
        'patent law',
        'concerns of research-based industries',
        'judge newman',
        'former patent lawyer',
        'dissent',
        'court',
        'motion for a rehearing of the case by the full court',
        'panel',
        'judicial legislation',
        'important high-technological industry',
        'regard',
        'consequences for research',
        'innovation',
        'public interest',
        'ms. raines',
        'judgement',
        'concern that the absence of patent lawyers on the court',
    );

    # Every maximal noun phrase is expected exactly once.
    my %count_of = map { $_ => 1 } @phrases;
    return \%count_of;
}

# Benchmark for single-noun extraction.
#
# Returns a reference to a fresh hash mapping each expected noun
# (lower-cased) to its expected count.  The test script compares this
# against the output of get_nouns() for the penn() passage.
#
# The hash is held in a lexical so that calling this sub does not overwrite
# any package variable.
sub noun_benchmark {
    my %count_of = (
        'lisa'         => 1,
        'raines'       => 2,
        'lawyer'       => 2,
        'director'     => 1,
        'relations'    => 1,
        'government'   => 1,
        'association'  => 1,
        'judge'        => 2,
        'patent'       => 3,
        'law'          => 1,
        'concerns'     => 1,
        'industries'   => 1,
        'newman'       => 1,
        'dissent'      => 1,
        'court'        => 3,
        'motion'       => 1,
        'rehearing'    => 1,
        'case'         => 1,
        'panel'        => 1,
        'legislation'  => 1,
        'industry'     => 1,
        'regard'       => 1,
        'consequences' => 1,
        'research'     => 1,
        'innovation'   => 1,
        'interest'     => 1,
        'ms.'          => 1,
        'judgement'    => 1,
        'concern'      => 1,
        'industrial'   => 1,
        'biotechnical' => 1,
        'absence'      => 1,
        'lawyers'      => 1,
    );
    return \%count_of;
}

# Benchmark for extraction of all noun phrases.
#
# Returns a reference to a fresh hash mapping each expected term
# (lower-cased) to its expected count.  It lists single nouns first, then
# multi-word phrases.  The test script compares this against the output of
# get_words() and get_noun_phrases() for the penn() passage.
#
# The hash is held in a lexical so that calling this sub does not overwrite
# any package variable.
sub np_benchmark {
    my %count_of = (

        # Single nouns
        'lisa'         => 1,
        'raines'       => 2,
        'lawyer'       => 2,
        'director'     => 1,
        'relations'    => 1,
        'government'   => 1,
        'association'  => 1,
        'judge'        => 2,
        'patent'       => 3,
        'law'          => 1,
        'concerns'     => 1,
        'industries'   => 1,
        'newman'       => 1,
        'dissent'      => 1,
        'court'        => 3,
        'motion'       => 1,
        'rehearing'    => 1,
        'case'         => 1,
        'panel'        => 1,
        'legislation'  => 1,
        'industry'     => 1,
        'regard'       => 1,
        'consequences' => 1,
        'research'     => 1,
        'innovation'   => 1,
        'interest'     => 1,
        'ms.'          => 1,
        'judgement'    => 1,
        'concern'      => 1,
        'industrial'   => 1,
        'biotechnical' => 1,
        'absence'      => 1,
        'lawyers'      => 1,

        # Multi-word phrases
        'lisa raines' => 1,
        'director of government relations for the industrial biotechnical association' => 1,
        'patent law' => 1,
        'concerns of research-based industries' => 1,
        'judge newman' => 1,
        'former patent lawyer' => 1,
        'motion for a rehearing of the case by the full court' => 1,
        'judicial legislation' => 1,
        'important high-technological industry' => 1,
        'consequences for research' => 1,
        'public interest' => 1,
        'ms. raines' => 1,
        'concern that the absence of patent lawyers on the court' => 1,
        'government relations' => 1,
        'industrial biotechnical association' => 1,
        'biotechnical association' => 1,
        'research-based industries' => 1,
        'patent lawyer' => 1,
        'full court' => 1,
        'high-technological industry' => 1,
        'patent lawyers' => 1,
    );
    return \%count_of;
}
# Benchmark for proper noun extraction.
#
# Returns a reference to a fresh hash mapping each expected proper noun
# (lower-cased) to the count 1.  The test script compares this against the
# output of get_proper_nouns() for the penn() passage.
#
# The hash is held in a lexical so that calling this sub does not overwrite
# any package variable.
sub nnp_benchmark {
    my %count_of = (
        'lisa raines'                         => 1,
        'industrial biotechnical association' => 1,
        'judge newman'                        => 1,
        'ms. raines'                          => 1,
    );
    return \%count_of;
}

# Empty stub: it defines no benchmark data and is not called anywhere in
# the visible part of this file.  Returns nothing (an empty list in list
# context, undef in scalar context).
sub words_benchmark {
    return;
}

#       Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome." 
                        



###############################################
# Words that mostly don't occur in the lexicon
###############################################
# Returns a fixed two-line sample sentence.  The second line keeps its
# eight leading spaces, so the text matches the original literal exactly.
sub jibberish {
    my @lines = (
        'Nils occludes the 5 corybantic sciolists from fressing upon the',
        '        northeast-oriented perambulations of the yabbering doyenne',
    );
    return join "\n", @lines;
}


##########################################################
# Hyphenated words that mostly don't occur in the lexicon
##########################################################
# Returns a fixed two-line sample text full of hyphenated words.  The
# second line keeps its eight leading spaces, so the text matches the
# original literal exactly.
sub hyphen {
    # brother-in-law not in lexicon, sister-in-law is
    my $first_line =
        'The brother-in-law. The sister-in-law. A strategy of tit-for-tat among';
    my $second_line = '        middle-eastern states.';
    return $first_line . "\n" . $second_line;
}



####################################################
# Test the tagger against an actual tagged corpus
####################################################
# Returns the sample passage that the benchmark hashes in this file
# describe: one indented line of text followed by a newline.  The heredoc
# is single-quoted because the passage contains nothing to interpolate.
sub penn {
    my $passage = <<'PENN';
        Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome." 
PENN
    return $passage;
}