The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: TokenParse.pm,v 1.20 2004/08/08 01:20:36 gene Exp $

package Lingua::TokenParse;
$VERSION = '0.1601';
use strict;
use warnings;
use Carp;
use Storable;
use Math::BaseCalc;

sub new {
    my $proto = shift;
    my $class = ref $proto || $proto;
    my $self  = {
        verbose      => 0,
        # The word to parse!
        word         => undef,
        # We need to use this.
        word_length  => 0,
        # Known tokens.
        lexicon      => {},
        # Local lexicon cache file name.
        lexicon_file => '',  # ?: 'lexicon-' . time(),
        # All word parts.
        parts        => [],
        # All possible parts combinations.
        combinations => [],
        # Scored list of the known parts combinations.
        knowns       => {},
        # Definitions of the known and unknown fragments in knowns.
        definitions  => {},
        # Fragment definition separator.
        separator    => ' + ',
        # Known-but-not-defined definition output string.
        not_defined  => '.',
        # Unknown definition output string.
        unknown      => '?',
        # Known trimming regexp rules.
        constraints  => [],
        @_,  # slurp anything else and override defaults.
    };
    bless $self, $class;
    $self->_init();
    return $self;
}

sub _init {
    my $self = shift;
    warn "Entering _init()\n" if $self->{verbose};
    $self->word( $self->{word} ) if $self->{word};
    # Retrieve our lexicon cache if a filename was set.
    $self->lexicon_cache;
}

sub DESTROY {
    my $self = shift;
    # Cache our lexicon if a filename has been given.
    $self->lexicon_cache( $self->{lexicon_file} )
        if $self->{lexicon_file};
}

sub verbose {
    my $self = shift;
    $self->{verbose} = shift if @_;
    return $self->{verbose};
}

sub word {
    # WORD: This method is the only place where word_length is set.
    my $self = shift;
    warn "Entering word()\n" if $self->{verbose};
    if( @_ ) {
        $self->{word} = shift;
        $self->{word_length} = length $self->{word};
        printf "\tword = %s\n\tlength = %d\n",
            $self->{word}, $self->{word_length}
            if $self->{verbose};
    }
    return $self->{word};
}

sub lexicon {
    my $self = shift;
    if( @_ ) {
        $self->{lexicon} = @_ == 1 && ref $_[0] eq 'HASH'
            ? shift
            : @_ % 2 == 0
                ? { @_ }
                : {};
    }
    return $self->{lexicon};
}

sub parts {
    my $self = shift;
    $self->{parts} = shift if @_;
    return $self->{parts};
}

sub combinations {
    my $self = shift;
    $self->{combinations} = shift if @_;
    return $self->{combinations};
}

sub knowns {
    my $self = shift;
    $self->{knowns} = shift if @_;
    return $self->{knowns};
}

sub definitions {
    my $self = shift;
    $self->{definitions} = shift if @_;
    return $self->{definitions};
}

sub separator {
    my $self = shift;
    $self->{separator} = shift if @_;
    return $self->{separator};
}

sub not_defined {
    my $self = shift;
    $self->{not_defined} = shift if @_;
    return $self->{not_defined};
}

sub unknown {
    my $self = shift;
    $self->{unknown} = shift if @_;
    return $self->{unknown};
}

sub constraints {
    my $self = shift;
    $self->{constraints} = shift if @_;
    return $self->{constraints};
}

sub parse {
    my $self = shift;
    warn "Entering parse()\n" if $self->{verbose};
    $self->word( shift ) if @_;
    croak 'No word provided.' unless defined $self->{word};
    croak 'No lexicon defined.' unless keys %{ $self->{lexicon} };
    # Reset our data structures.
    $self->parts([]);
    $self->definitions({});
    $self->combinations([]);
    $self->knowns({});
    # Build new ones based on the word.
    $self->build_parts;
    $self->build_definitions;
    $self->build_combinations;
    $self->build_knowns;
}

sub build_parts {
    my $self = shift;
    warn "Entering build_parts()\n" if $self->{verbose};

    for my $i (0 .. $self->{word_length} - 1) {
        for my $j (1 .. $self->{word_length} - $i) {
            my $part = substr $self->{word}, $i, $j;
            push @{ $self->{parts} }, $part
                unless grep { $part =~ /$_/ }
                    @{ $self->constraints };
        }
    }

    if($self->{verbose}) {
        # XXX This is ugly.
        my $last = 0;
        for my $part (@{ $self->{parts} }) {
            print '',
                ($last ? $last > length( $part ) ? "\n\t" : ', ' : "\t"),
                $part;
            $last = length $part;
        }
        print "\n" if @{ $self->{parts} };
    }
    return $self->{parts};
}

# Save a known combination entry => definition table.
sub build_definitions {
    my $self = shift;
    warn "Entering build_definitions()\n" if $self->{verbose};
    for my $part (@{ $self->{parts} }) {
        $self->{definitions}{$part} = $self->{lexicon}{$part}
            if $self->{lexicon}{$part};
    }
    warn "\t", join( "\n\t", sort keys %{ $self->definitions } ), "\n"
        if $self->{verbose};
    return $self->{definitions};
}

sub build_combinations {
    my $self = shift;
    warn "Entering build_combinations()\n" if $self->{verbose};

    # field size for binary iteration (digits of precision)
    my $y  = $self->{word_length} - 1;
    # total number of zero-based combinations
    my $z  = 2 ** $y - 1;
    # field size for the count
    my $lz = length $z;
    # field size for a combination
    my $m  = $self->{word_length} + $y;
    warn sprintf
        "\tTotal combinations: %d\n\tConstrained combinations:\n",
        $z + 1
        if $self->{verbose};

    # Truth is a single partition character: the lowly dot.
    my $c = Math::BaseCalc->new( digits => [ 0, '.' ] );

    # Build a word part combination for each iteration.
    for my $n ( 0 .. $z ) {
        # Iterate in base two.
        my $i = $c->to_base( $n );

        # Get the binary digits as an array.
        my @i = split //, sprintf( '%0'.$y.'s', $i );

        # Join the character and digit arrays into a partitioned word.
        my $t = '';
        # ..by stepping over the characters and peeling off a digit.
        for( split //, $self->{word} ) {
            # Zero values become ''. Haha!  Truth prevails.
            $t .= $_ . (shift( @i ) || '');
        }

        unless( grep { $t =~ /$_/ } @{ $self->{constraints} } ) {
            # Preach it.
            printf "\t%".$lz.'d) %0'.$y.'s => %'.$m."s\n", $n, $i, $t
                if $self->{verbose};
            push @{ $self->combinations }, $t;
        }
    }

    return $self->{combinations};
}

sub build_knowns {
    my $self = shift;
    return unless scalar keys %{ $self->{lexicon} };
    warn "Entering build_knowns()\n" if $self->{verbose};

    # Save the familiarity value for each "raw" combination.
    for my $combo (@{ $self->{combinations} }) {
        # Skip combinations that have already been seen.
        next if exists $self->{knowns}{$combo};

        my ($sum, $frag_sum, $char_sum) = (0, 0, 0);

        # Get the bits of the combination.
        my @chunks = split /\./, $combo;
        for (@chunks) {
            # XXX Uh.. Magically handle hyphens in lexicon entries.
            ($_, my $combo_seen) = _hyphenate($_, $self->lexicon, 0);

            # Sum the combination familiarity values.
            if ($combo_seen) {
                $frag_sum++;
                $char_sum += length;
            }
        }
# XXX Huh? Why?  Can $_ change or something?
        # Stick our combination back together.
        $combo = join '.', @chunks;

        # Save this combination and its familiarity ratios.
        my $x = $frag_sum / @chunks;
        my $y = $char_sum / $self->{word_length};
        warn "\t$combo: [$x, $y]\n" if $self->{verbose};
        if( $x || $y ) {
            $self->{knowns}{$combo} = [ $x, $y ];
        }
        else {
            delete $self->{knowns}{$combo};
        }
    }

    return $self->{knowns};
}

# Reduce the number of known combinations by concatinating adjacent
# unknowns (and then removing any duplicates produced).

sub learn {
    my ($self, %args) = @_;
    # Get the list of (partially) unknown stem combinations.
    # Loop through each looking in %args or prompting for a definition.
}

# Update the given string with its actual lexicon value and increment
# the seen flag.
sub _hyphenate {
    my ($string, $lexicon, $combo_seen) = @_;

    if (exists $lexicon->{$string}) {
        $combo_seen++ if defined $combo_seen;
    }
    elsif (exists $lexicon->{"-$string"}) {
        $combo_seen++ if defined $combo_seen;
        $string = "-$string";
    }
    elsif (exists $lexicon->{"$string-"}) {
        $combo_seen++ if defined $combo_seen;
        $string = "$string-";
    }

    return wantarray ? ($string, $combo_seen) : $string;
}

sub output_knowns {
    my $self = shift;
    my @out = ();
    my $header = <<HEADER;
Combination [frag familiarity, char familiarity]
Fragment definitions

HEADER

    for my $known (
        reverse sort {
            $self->{knowns}{$a}[0] <=> $self->{knowns}{$b}[0] ||
            $self->{knowns}{$a}[1] <=> $self->{knowns}{$b}[1]
        } keys %{ $self->{knowns} }
    ) {
        my @definition;
        for my $chunk (split /\./, $known) {
            push @definition,
                defined $self->{definitions}{$chunk}
                    ? $self->{definitions}{$chunk}
                        ? $self->{definitions}{$chunk}
                        : $self->{not_defined}
                    : $self->{unknown};
        }

        push @out, sprintf qq/%s [%s]\n%s/,
            $known,
            join (', ', map { sprintf '%0.2f', $_ }
                @{ $self->{knowns}{$known} }),
            join ($self->{separator}, @definition);
    }

    return wantarray ? @out : $header . join "\n\n", @out;
}

# Naive, no locking read/write.  If you run a production environment,
# you know what to do.
sub lexicon_cache {
    my( $self, $file, $value ) = @_;
    warn "Entering lexicon_cache()\n" if $self->{verbose};

    # Set the file and the lexicon_file attribute if we are told to.
    if( $file && $file eq 'lexicon_file' && $value ) {
        $self->{lexicon_file} = $value;
        $file = $value;
    }

    # If there is no file try to use the lexicon_file.
    $file ||= $self->{lexicon_file};
    # Otherwise, bail out!
    warn( "No lexicon cache file set\n" ) and return
        if $self->{verbose} && !$file;

    if( $file ) {
        # Store 'em if you got 'em.
        if( keys %{ $self->{lexicon} } ) {
            warn "store( $self->{lexicon}, $file )\n" if $self->{verbose};
            store( $self->{lexicon}, $file );
        }
        # ..Retrieve 'em if not.
        else {
            warn "retrieve( $file )\n" if $self->{verbose} && -e $file;
            $self->lexicon( retrieve( $file ) ) if -e $file;
        }
    }
}

1;

__END__

=head1 NAME

Lingua::TokenParse - Parse a word into scored, fragment combinations

=head1 SYNOPSIS

  use Lingua::TokenParse;
  my $p = Lingua::TokenParse->new(
    word => 'antidisthoughtlessfulneodeoxyribonucleicfoo',
    lexicon => {
        a    => 'not',
        anti => 'opposite',
        di   => 'two',
        dis  => 'separation',
        eo   => 'hmmmmm',  # etc.
    },
    constraints => [ qr/eo(?:\.|$)/ ], # no parts ending in eo allowed
  );
  print Data::Dumper($p->knowns);

=head1 DESCRIPTION

This class represents a Lingua::TokenParse object and contains
methods to parse a given word into familiar combinations based
on a lexicon of known word parts.  This lexicon is a simple
I<fragment =E<gt> definition> list.

Words like "automobile" and "deoyribonucleic" are composed of
different roots, prefixes, suffixes, etc.  With a lexicon of known
fragments, a word can be partitioned into a list of its (possibly
overlapping) known and unknown fragment combinations.

These combinations can be given a score, which represents a measure of
word familiarity.  This measure is a set of ratios of known to unknown
fragments and letters.

=head1 METHODS

=head2 new

  $p = Lingua::TokenParse->new(
      verbose => 0,
      word => $word,
      lexicon => \%lexicon,
      lexicon_file => $lexicon_file,
      constraints => \@constraints,
  );

Return a new Lingua::TokenParse object.

This method will automatically call the partition methods (detailed
below) if a word and lexicon are provided.

The C<word> can be any string, however, you will want to make sure that
it does not include the same characters you use for the C<separator>,
C<not_defined> and C<unknown> strings (described below).

The C<lexicon> must be a hash reference with word fragments as keys and
definitions as their respective values.

=head2 parse

  $p->parse;
  $p->parse($string);

This method clears the partition lists and then calls all the
individual parsing methods that are detailed below.  If a string
is provided the object's C<word> attribute is reset to that, first.

=head2 build_parts

  $parts = $p->build_parts;

Construct an array reference of the word partitions.

=head2 build_definitions

  $known_definitions = $p->build_definitions;

Construct a table of the definitions of the word parts.

=head2 build_combinations

  $combos = $p->build_combinations;

Compute the array reference of all possible word part combinations.

=head2 build_knowns

  $raw_knowns = $p->build_knowns;

Compute the familiar word part combinations.

This method handles word parts containing prefix and suffix hyphens,
which encode information about what is a syntactically illegal word
combination, which can be used to score (or even throw out bogus
combinations).

=head2 lexicon_cache

  $p->lexicon_cache;
  $p->lexicon_cache( $lexicon_file );
  $p->lexicon_cache( lexicon_file => $lexicon_file );

Backup and retrieve the hash reference of token entries.

If this method is called with no arguments, the object's
C<lexicon_file> is used.  If the method is called with a single
argument, the object's C<lexicon_file> attribute is temporarily
overridden.  If the method is called with two arguments and the first
is the string "lexicon_file" then that attribute is set before
proceeding.

=head1 CONVENIENCE METHOD

=head2 output_knowns

  @known_list = $p->output_knowns;
  print Dumper \@known_list;

  # Look at the "even friendlier output."
  print scalar $p->output_knowns(
      separator   => $separator,
      not_defined => $not_defined,
      unknown     => $unknown,
  );

This method returns the familiar word part combinations in a couple
"human accessible" formats.  Each have familiarity scores rounded to
two decimals and fragment definitions shown in a readable layout

=over 4

=item separator

The the string used between fragment definitions.  Default is a plus
symbol surrounded by single spaces: ' + '.

=item not_defined

Indicates a known fragment that has no definition.  Default is a
single period: '.'.

=item unknown

Indicates an unknown fragment.  The default is the question mark: '?'.

=back

=head1 ACCESSORS

=head2 word

  $p->word($word);
  $word = $p->word;

The actual word to partition which can be any string.

=head2 lexicon

  $p->lexicon(%lexicon);
  $p->lexicon(\%lexicon);
  $lexicon = $p->lexicon;

The lexicon is a hash reference with word fragments as keys and
definitions their respective values.  It can be set with either a
hash or a hash reference.

=head2 parts

  $parts = $p->parts;

The computed array reference of all possible word partitions.

=head2 combinations

  $combinations = $p->combinations;

The computed array reference of all possible word part combinations.

=head2 knowns

  $knowns = $p->knowns;

The computed hash reference of known (non-zero scored) combinations
with their familiarity values.

=head2 definitions

  $definitions = $p->definitions;

The hash reference of the definitions provided for each fragment of
the combinations with the values of unknown fragments set to undef.

=head2 constraints

  $constraints = $p->constraints;
  $p->constraints(\@regexps);

An optional, user defined array reference of regular expressions to
apply when constructing the list of parts and combinations.  This
acts as a negative pruning device, meaning that if a match is
successful, the entry is excluded from the list.

=head1 EXAMPLES

Example code can be found in the distribution C<eg/> directory.

=head1 TO DO

Turn the lame C<output_knowns> method into a sensible XML serializer
(of optionally everything).

Compute the time required for a given parse.

Make a method to add definitions for unknown fragments and call it...
C<learn()>.

Use traditional stemming to trim down the common knowns and see if
the score is the same...

Synthesize a term list based on a thesaurus of word-part definitions.
That is, go in reverse.  Non-trivial!

=head1 SEE ALSO

L<Storable>

L<Math::BaseCalc>

=head1 DEDICATION

For my Grandmother and English teacher Frances Jones.

=head1 THANK YOU

Thank you to Luc St-Louis for helping me increase the speed while
eliminating the exponential memory footprint.  I wish I knew your
email address so I could tell you.  B<lucs++>

=head1 AUTHOR

Gene Boggs E<lt>gene@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2003-2004 by Gene Boggs

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut