The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Text::Annotate::WordScan;
use strict;
use Carp;
our $VERSION;
$VERSION = 0.01_2;

sub scan_words { 
    my ($class, $content, $cbk) = @_;
    ($cbk) or croak "need callback";
    my $it = $class->new_it($content);

    # moving around for speed
    # my @pipe = (\&_findsen, \&_findwords, _gfindphrases(), \&_canon, $cbk);
    my @pipe = (\&_findsen, \&_findwords, \&_precanon, _gfindphrases(), $cbk);
    my $c = $class->pipe_to_code(\@pipe);
    $c->([$content]);
};


sub scan_wordsx { 
    my ($class, $content, $cbk) = @_;
    ($cbk) or croak "need callback";
    my $it = $class->new_it($content);

    # a nest of while(s): semipredicate problem.
    # bah, define "untrue" weird value.

    my @pipe = (\&_findsen, \&_findwords, \&_canon, $cbk);
    my $c = $class->pipe_to_code(\@pipe);
    $c->([$content]);
};

sub new_it { 
    my $class = shift;
    my $content = shift;
    my $self = bless { c => $content }, $class;
};

sub ping {
    my $self = shift;
    return ($self->{c} =~ /\s*([^\!\?\.]+)\s*/sg) ? $1 : ();
};

sub pipe_to_code { 
    my ($self, $pipe) = @_;
    
    my $buf = 'sub {';
    my $n = 0;

    my $vvprev = '$_[0]'; # change for OO

    foreach my $e (@$pipe) { $e = ref $e ? Text::Annotate::WordScan::SubWrap->new($e) : $e->new };

    foreach my $e (@$pipe) {
	my $vvar = '$v'.$n;
	my $debug = my $debug2 = "";
	# my $debug = "warn 'pipesegment $n in:', Dumper($vvprev);";
	# my $debug2 = "warn 'pipesegment $n out:', Dumper($vvar);";
	$buf .= "$debug \$pipe->[$n]->add($vvprev); while (my $vvar = \$pipe->[$n]->iterate) { $debug2; ";
	$vvprev = $vvar;
	$n++;
    }

    $buf .= '}' x (@$pipe+1);
    (my $subref = eval $buf) or die "Didn't like that: $@";
}
    
sub _findsen { 
    my $in = shift; 
    my @out;
    foreach my $e (@$in) { 
	my $tidy = $e;
	$tidy =~ s/\s+/ /sg;
	push @out, split /[\.\?\!]+/, $tidy 
	};
    @out ? \@out : ();
};

sub _findwords {
    my $in = shift;
    my @out;
    foreach my $sen (@$in) { 
	my @words = grep {length $_} (split /[\s\!\?\.]+/, $sen);
	push @out, \@words if @words;
    }
    @out ? \@out : ();
}

# and now, the tricky bit.

# we are simultaneously...
# - splitting into words
# - regrouping those words into potential phrases (tricky bit, as an iterator)
# - confusing outselves totally

# - MAKE SURE WE DON'T LOSE STUFF
# - OO may actually be tidier.

# - This is now not spinning, but not quite working either.

sub _gfindphrases { 
    my @sentences;
    my $phl = 0;
    my $max_phl = 4;
    my $outmax = 10000; # TODO: MAKE CONFIGURABLE
    my $delimiter = "";
    my ($pos, $phraselength) = (0, 0);

    my $s = 
	sub { 
	    my $in = shift;
	    push @sentences, @$in if ($in && @$in);
	    my @out;
	    while (@out < $outmax) { 
		last if (!@sentences); # needed?
		my $foo = @sentences;
		my $max = $pos + $phraselength;
		if ($max >= @{$sentences[0]}) {
		    $pos = 0;
		    $phraselength++;
		    $max = $pos + $phraselength;
		};
		if (($phraselength > $max_phl) || ($phraselength >= @{$sentences[0]})) { 
		    $phraselength = 0;
		    $max = $pos + $phraselength;
		    shift @sentences;
		}
		last if (!@sentences);
		push @out, (join $delimiter, @{$sentences[0]}[$pos++..$max]);
		
	    };
	    return @out ? \@out : ();
	};
    $s;
}


sub _findphrasesold {  # TURN THIS INTO ITERATOR!
    my $data = shift;
    my @out;
    foreach my $sentance (@$data) { 
	my @words = grep {length $_} (split /[\s\!\?\.]+/, $sentance);
	my $i = 0;
	while (1) { 
	    foreach my $phraselength (qw(0 1 2 3 4)) {
		my $max = $i + $phraselength;
		last if ($max >= @words);
		my $foo = join " ", @words[$i..$max];
		push @out, $foo;
	    };
	    $i++;
	    last if ($i > @words); # hmm
	}
    };
    \@out;
};


sub _precanon { 
    my $data = shift;
    my @out;
    foreach my $sentence (@$data) { 
	foreach my $word (@$sentence) { 
	    $word = lc $word;
	    $word =~ s/[^a-z0-9]//sg;
	}
    }
    $data;
};

sub _canon { 
    my $data = shift;
    my @out;
    foreach my $e (@$data) { 
	$e = lc $e;
	$e =~ s/[^a-z0-9]//sg;
	push @out, $e;
    }
    \@out;
};


sub canonicalize_id { 
    my ($self, $word) = @_;
    return _canon([$word])->[0];
};

package Text::Annotate::WordScan::SubWrap;

sub new { 
    my ($class, $sub) = @_;
    $class = ref $class || $class;
    (ref $sub eq "CODE") or die "'$sub' ain't a sub";
    my $self = [$sub];
    bless $self, $class;
};

sub add { 
    my ($self, $data) = @_;
    $self->[1] = $data;
};

# todo: debugging is a problem.

sub iterate { 
    my $self = shift;
    my $fn = $self->[0];
    my $out = [];
    $out = $fn->($self->[1]);
    $self->[1] = undef;
    ($out && @$out) ? $out : ();
};
    
package CGI::Kwiki::WordScan::Buffer;

sub new {
    return bless [];
};

sub add { 
    my ($self, $data) = @_;
    (ref $data eq "ARRAY") or die "nay";
    push @$self, @$data;
};

sub iterate { 
    my $self = shift;
    @$self ? [shift @$self] : ();
};

1;