package Regexp::Match::List;
# $Id: List.pm,v 1.1.1.1.8.3 2004/04/29 01:45:31 dgrant Exp $
# IDEA: allow match() to skip regexps below a certain hitrate.
# IDEA: use qr// to precompile regexps
use strict;
use warnings;
use base qw( Class::Base );
use Data::Sorting qw( :basics :arrays );
use Data::Dumper;
use vars qw($VERSION %CONF);
$VERSION = 0.50;
%CONF =
# CONFIGURATION -- This configuation is loaded into $self via load_args()
(
# INTERNAL DEFAULTS (can be touched externally)
USESTUDY => 1, # use "study STRING;" for regexp strings
OPCHECK => 50, # Num of match() calls before calling optimize()
OPSKIP => 0, # Skip optimize() ?
OPWEIGHT => 1, # Default regexp hit weight
OPHITS => 0, # Default regexp hits
OPSORTCONF => # Data::Sorting Sort Rules. Used in optimize()
[ # The hashlike syntax is to get around some issue
# in Data::Sorting that wouldn't let me use a hashref
-compare => 'numeric',
-order => 'reverse',
-sortkey => sub { $_[0]->{'hits'} * $_[0]->{'weight'} }
],
# INTERNAL STRUCTURE (cannot be touched externally)
'_RE' => [], # Store regexps in arrayref. See add()
'_COUNT' => # Number of times a function has been called
{
match => 0,
optimize => 0
},
);
sub match($$)
# PUBLIC METHOD
# Test a string for all available regular expressions.
#
{
my $self = shift;
my ($string) = @_;
my ($RE, $test, @results);
# A possible regexp optimization. see % perldoc -f study
study $string if ($self->{'USESTUDY'});
REGEXP:
for my $i (0..$#{ $self->{'_RE'} })
# Iterate through all regular expressions.
# This uses a for() b/c it allows for more control
# than Set::Array::foreach() (we can escape on a match)
{
$self->_increment(); # $self->{'_COUNT'}{'match'}++
$self->optimize(); # which is used by optimize()
$RE = $self->{'_RE'}->[$i]; # The current regular expression
# Execute the regular expression in list context and
# store the results ($1 .. $n) in an array
@results = ($string =~ $RE->{'test'});
$self->debug("STRING:$string\n");
$self->debug("TEST:$RE->{'test'}\n");
$self->debug("RESULTS:", (scalar(@results)), '-', join(',', @results), "\n\n");
if ($RE->callback(@results))
# A successful match may not be enough for a positive
# result depending on the outcome of the callback which
# is entirely out of Regexp::Match::List's control.
# When it is, we acknowledge and reward a successful
# regular expression, then bust out of this hellish loop.
{
$RE->count_hit(); # $RE->{'hits'}++
last REGEXP; # Bust out
}
}
#print Dumper($RE, @results);
return ($results[0])
? ($RE, @results)
: ();
}
sub add(\%)
# PUBLIC METHOD
# Add a regular expression to the mix.
# IN: (scalar) regular expression w/o '/' (i.e. '^.+?\s$')
# [(scalar) multiplier for hits, used by optimize() ]
# OUT: Whatever Set::Array::push() returns
{
my $self = shift;
my %re = @_;
$self->check_re_conf(\%re);
$re{'weight'} ||= $self->{'OPWEIGHT'};
$re{'hits'} ||= $self->{'OPHITS'};
push (@{ $self->{'_RE'} }, Regexp::Match::List::RE->create(%re));
}
sub check_re_conf(\%)
# Determine whether the given hashref contains all the information
# required to create a regexp entry in $self->{'RE'}
# TODO: complete check_re_conf()
{
my $self = shift;
return 1;
}
sub optimize()
# PUBLIC METHOD, USED INTERNALLY
# Sort Set::Array object of regular expressions by # of times
# match() is called. This will run only when match() has been called
# a multiple of $self->{'OPCHECK'} times
{
my $self = shift;
my $cnt_match = $self->_count('match');
# We only optimize when...
return if (
# we are told allowed to, and when...
($self->{'OPSKIP'} == 1) ||
# the iteration counter reaches a multiple of $self->{'OPCHECK'}
(($cnt_match % $self->{'OPCHECK'}) > 1)
);
# Count up a hit for this function only when we actually resort
# This information is only useful for reference
$self->_increment(); # $self->{'_COUNT'}{'optimize'}++
$self->debug("optimize(): running at match() call #$cnt_match\n\n");
# Sort using Data::Sorting. $self->{'OPSORTCONF'} contains a
# sort rule configuration.
sort_arrayref($self->{'_RE'}, @{ $self->{'OPSORTCONF'} });
}
# EXTREMELY PRIVATE METHODS
# Haha. Philstrdamous, I know you love this one.
# Increments a counter by one. The particular counter is determined
# by the calling function. i.e. $self->{'_COUNT'}{'optimize'}++
sub _increment() { $_[0]->{'_COUNT'}{ (split '::', (caller(1))[3])[3] }++ }
# Returns the value of the counter for the given function
sub _count() { $_[0]->{'_COUNT'}{$_[1]} }
# CONSTRUCTOR RELATED
sub init()
# Rekindle all that we are
{
my ($self, $config) = @_; # Get vars from Class::Base::new()
$self->load_args($config); # Load config into $self
$self->create_attributes(); # Set our attributes and defaults
return $self;
}
sub create_attributes()
# Add internal attributes to $self (does not overwrite existing values)
# AND apply default values to externally setable parameters
{
my $self = shift;
# See %CONF declaration at the top of this file
foreach my $a (keys %CONF)
{
$self->{$a} = $CONF{$a} unless (exists($self->{$a}));
}
return $self;
}
sub load_args($$)
# Used by the constructor to load config into $self.
# NOTE: _ is skipped
{
my ($self, $args) = (shift, shift);
for my $key (keys %{ $args })
{
# Skip values that could overwrite internal attributes
next if $key =~ /^\_/;
(!exists($self->{$key}))
? $self->{$key} = $args->{$key}
: ($self->debug("loadArgs: $key already exists in \$self"));
}
return $self;
}
###############################################################################
# TODO: move into separate module (Regexp::Match::List::RE?)
package Regexp::Match::List::RE;
# A simple object to store a regular expression test and all its matter
sub create()
# A constructor.
# See add()
{
my $class = shift;
my %att = @_;
my $self = {};
bless \%att, $class;
}
# Increment hit tally for this regular expression by user value or 1
# See match()
sub count_hit() { shift->{'hits'} += shift || 1; }
sub callback()
# Run this regular expression's callback if one exists.
# By default we will return the result of the regexp test.
# IN: (array) results ($1 .. $n) of this RE on the current string
# OUT: (bool) success as determined by the callback
#
# See match()
{
my $self = shift;
# If there is no callback, return the test result
return ($#_ >= 0) unless(exists($self->{'callback'}));
# Send the callback the test result as well as a reference to ourself
return &{ $self->{'callback'} }($self, @_);
}
# $Log: List.pm,v $
# Revision 1.1.1.1.8.3 2004/04/29 01:45:31 dgrant
# - Initial preparation for CPAN
#
# Revision 1.1.1.1.8.2 2004/04/23 23:30:25 dgrant
# - Added callback template to Regexp/Match/List.pm
#
# Revision 1.1.1.1.8.1 2004/04/16 17:10:34 dgrant
# - Merging libperl-016 changes into the libperl-1-current trunk
#
# Revision 1.1.1.1.2.1.20.2 2004/04/08 18:23:56 dgrant
# *** empty log message ***
#
# Revision 1.1.1.1.2.1.20.1 2004/04/08 16:42:30 dgrant
# - No significant change
#
# Revision 1.1.1.1.2.1 2004/03/25 01:49:51 dgrant
# - Inital import of List.pm
# - Added cvs Id and Log variables
#
1;
__END__
=head1 NAME
Regexp::Match::List - Matches a string to a list of regular expressions
=head1 SYNOPSIS 1 (short)
my $re = Regexp::Match::List->new(
DEBUG => 1, # share debugging output (caught by Class::Base)
OPCHECK => 100, # how often to reoptimize regexps
OPSKIP => 0, # Skip optimize()?
OPWEIGHT => 1, # default regexp hit multiplier
OPSORTSUB => sub { ... }, # sorting algorithm used by optimize()
);
$re->add('(?i:(trans)(\w\w\w)(tite))', weight => 1.5, hits => 0, somekey => somevalue );
# $RE contains the configured regular expression that successfully matched
# the string. You have access to $RE->{'weight'}, $RE->{'callback'},
# $RE->{'somekey'}, etc...
# @results contains the m// for paired parentheses. In the example below,
# it would contain ('trans','ves','tite');
my ($RE, @results) = $re->match('transvestite ');
# Callback template:
sub somesub($@)
# This callback is called regardless of whether the regular expression
# matched the string. Returning any true value will tell match() that
# this was a success. Any non-true value will admit failure.
{
my ($RE, @results) = @_;
# ... do something
# here you can add more criteria for a particular match
#
# Here we maintain the same return value that match() would
# return on. Any true value will tell match() this match was
# a smashing success.
return $#results >= 0;
# If we did this, all matches would be considered unsuccessful
# return 0;
}
=head1 DESCRIPTION
Regexp::Match::List matches a string to a list of regular expressions
with callbacks and sorting optimization for large datasets.
Think Regexp::Match::Any with optimization (sort on usage trends, most
popular first -- see Data::Sorting) and expanded functionality.
note: all parameters are stored in an RE object and returned on a positive match
note: the callback is called for every regexp test (successful or not)
so it gets the final say as to whether or not there was a match
note: the callback is given the RE object. (see bottom the example above)
=head1 STABILITY
This module is currently undergoing rapid development and there is much left to
do. This module is beta-quality, although it hasn't been extensively tested
or optimized.
It has been tested only on Solaris 8 (SPARC64).
=head1 KNOWN BUGS
None
=head1 SEE ALSO
Regexp::Match::Any, Regexp::Common, Data::Sorting, Class::Base
=head1 AUTHOR
Delano Mandelbaum, E<lt>horrible<AT>murderer.caE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2004 by Delano Mandelbaum
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
See http://www.perl.com/perl/misc/Artistic.html
=cut