The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package WordLists::Tag::Tagger;
use strict;
use warnings;
use utf8;
use WordLists::Common qw(/generic/);
use Lingua::EN::Tagger;
use WordLists::WordList;
use WordLists::Sense;
use WordLists::Base;
our $VERSION = $WordLists::Base::VERSION;

our $AUTOLOAD;
our @ignore_pos_codes = qw(cd to prp prps sym pp pps ppr lrb rrb ppc ppl );
sub _norm_word($)
{
	my $s = shift;
	$s=~s/\.//g;
	$s=~s/\/.*//;
	$s=~s/['’]s$//;
	$s=~s/['‘’`"“”]$//;
	$s=~s/^['‘’`"“”]//;
	$s=~tr/…–\xA0\*//d;
	return lc $s;
}
sub human_pos($)
{
	my $sPos = shift;
	$sPos =~	s<^nn.*$>
				<noun>;
	$sPos =~	s<^in$>
				<preposition>;
	$sPos =~	s<^to$>
				<preposition>;
	$sPos =~	s<^jj.*$>
				<adjective>;
	$sPos =~	s<^md.*$>
				<modal verb>;
	$sPos =~	s<^vb.*$>
				<verb>;
	$sPos =~	s<^rb.*$>
				<adverb>;
	$sPos =~	s<^det$>
				<determiner>;
	$sPos =~	s<^cc$>
				<conjunction>;
	$sPos =~	s<^wrb$>
				<pronoun>;
	$sPos =~	s<^wdt$>
				<pronoun>;
	$sPos =~	s<^wp$>
				<pronoun>;
	$sPos =~	s<^prp.*$>
				<pronoun>;
	$sPos =~	s<^cd.*$>
				<number>;
	$sPos =~	s<^uh$>
				<interjection>;
	return $sPos;
}


sub new
{
	my ($class,  $args) = @_;
	$args ||={};
	my $self = {
		tagger=>Lingua::EN::Tagger->new(%$args),
	};
	bless ($self, $class);
}

1;

sub add_tags
{
	my ($self, $sMS) = @_;
	my $taggedMS = $self->{tagger}->add_tags( $sMS );
	return $taggedMS;
}

sub add_human_tags
{
	my ($self, $sMS) = @_;
	my  $sMSOUT = '';
	foreach my $sSentence (@{$self->{tagger}->get_sentences($sMS)})
	{
		$sSentence =~ tr/<>&//d;
		my $taggedSentence = $self->{tagger}->add_tags( $sSentence );
		$taggedSentence =~ s`<([a-z]+)>([^<]+)</\1>`human_pos($1) ne $1 ? qq(<span pos=").human_pos($1).qq(">$2</span>) : $2;`ge;
		$sMSOUT .= "<p>$taggedSentence</p> ";
	}
	return $sMSOUT;
}

sub get_wordlist
{
	my ($self, $sUntagged, $args) = @_;
	
	my $wl;
	if (defined $args->{'wl'} and ref $args->{'wl'} eq 'WordLists::WordList')
	{
		$wl=$args->{'wl'};
	}
	else
	{
		$wl = WordLists::WordList->new;
	}
	foreach my $sSentence (@{$self->{tagger}->get_sentences($sUntagged)})
	{
		my $taggedMS;
		$taggedMS = $self->{tagger}->add_tags( $sSentence );
		while ($taggedMS =~ m`<([a-z]+)>([^<]+)</\1>`g)
		{
			#print "\n$1\t$2";
			my $sHW = _norm_word($2);
			my $sPosCode = $1;
			my $bNext;
			foreach (@ignore_pos_codes) #
			{
				if ($sPosCode eq $_)
				{
					$bNext++;
					last;
				}
			}
			next if $bNext;
			my $sPos = human_pos($sPosCode);
			my $sense = WordLists::Sense->new();
			$sense->set_hw($sHW);
			$sense->set_pos($sPos);
			$sense->set_eg($sSentence);
			$sense->set_poscode($sPosCode);
			if (defined $args->{'callback_on_make_sense'} and ref $args->{'callback_on_make_sense'} eq ref sub{})
			{
				&{$args->{'callback_on_make_sense'}}($sense);
			}
			my @senses = $wl->get_senses_for($sense->get_hw, $sense->get_pos);
			if (@senses and !$args->{'keep_repeats'})
			{
				
			}
			else
			{
				if (defined $args->{'test_sense_before_add'} and ref $args->{'test_sense_before_add'} eq ref sub {} and !&{$args->{'test_sense_before_add'}}($sense))
				{
				}
				else
				{
					if (defined $args->{'callback_on_add_sense'} and ref $args->{'callback_on_add_sense'} eq ref sub{})
					{
						&{$args->{'callback_on_add_sense'}}($sense);
					}
					$wl->add_sense($sense);
				}
			}
		};
	}
	return $wl;
}
=pod

=head1 NAME

WordLists::Tag::Tagger

=head1 SYNOPSIS
	
	my $tagger = WordLists::Tag::Tagger->new();
	my $wl = $tagger->get_wordlist('The quick brown fox jumped over the lazy dog');

=head1 DESCRIPTION

Uses L<Lingua::EN::Tagger> to do various things with strings, chielfly to create a L<WordLists::WordList> out of a document, e.g. to use as a basis for a glossary.

=head1 METHODS

=head3 get_wordlist

Uses L<Lingua::EN::Tagger> to create a L<WordLists::WordList> out of a string (e.g. a manuscript).

L<Lingua::EN::Tagger> allows splitting into sentences, and these sentences become C<eg> fields in the L<WordLists::Sense>s generated. 

Only the first instance of each headword / part of speech combination is entered into the list, unless the third argument has a key C<keep_repeats> with a true value.

The fields populated are: C<hw>, C<pos>, C<eg>, and C<poscode>, which is the original part of speech code outputted by the tagger.

The third argument is a hashref which allows you to configure several options.

C<callback_on_add_sense> should be a coderef. It is passed the sense immediately before it is added to the wordlist, e.g. if you want to add a unit. 

C<callback_on_make_sense> should be a coderef. It is passed the sense before the wordlist is tested for inclusion. This is an opportunity to further normalise parts of speech. 

C<keep_repeats> is a flag, which, if set, prevents the code from removing repetitions.

=head1 TODO

C<ignore> is a wordlist or arrayref whose elements are L<WordLists::Sense> objects or plain hashrefs of the form C<< {hw=>'head', pos =>'n'} >>, or headwords as strings. If these words are found, they are not added to the list. (not yet implemented!)

=head1 BUGS

Please use the Github issues tracker.

=head1 LICENSE

Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=cut