The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Parse::Tokens;

# $Id: Tokens.pm,v 1.5 2001/11/28 01:14:55 steve Exp $

# Copyright 2000-2001 by Steve McKay. All rights reserved.
# This library is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.

use strict;
use vars qw( $VERSION  );

$VERSION = 0.27;

sub new
{
	my ( $proto, $params ) = @_;
	my $class = ref($proto) || $proto;
	my $self = {
		debug => undef,
		text => undef,
		autoflush => undef,
		loose_paring => undef,
		pre_callback => undef,
		post_callback => undef,
		token_callback => undef,
		ether_callback => undef,
		delimiters => [],
		delim_index => {},
	};
	bless( $self, $class );
	$self->init( $params );
	$self;
}

sub init
{
    my( $self, @args ) = @_;
	no strict 'refs';
	$self->_msg( "Processing initialization arguments." );
	for ( keys %{$args[0]} )
	{
		my $ref = lc $_;
		$self->$ref( $args[0]->{$_} );
	}
	use strict;
}

sub debug
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'debug' prefs." );
	$self->{'debug'} = $args[0] if defined $args[0];
	return $self->{'debug'};
}

sub token_callback
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'token_callback' prefs." );
	$self->{'token_callback'} = $args[0] if defined $args[0];
	return $self->{'token_callback'};
}

sub ether_callback
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'ether_callback' prefs." );
	$self->{'ether_callback'} = $args[0] if defined $args[0];
	return $self->{'ether_callback'};
}

sub pre_callback
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'pre_callback' prefs." );
	$self->{'pre_callback'} = $args[0] if defined $args[0];
	return $self->{'pre_callback'};
}

sub post_callback
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'post_callback' prefs." );
	$self->{'post_callback'} = $args[0] if defined $args[0];
	return $self->{'post_callback'};
}

sub loose_paring
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'loose_paring' prefs." );
	$self->{'loose_paring'} = $args[0] if defined $args[0];
	return $self->{'loose_paring'};
}

sub autoflush
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'autoflush' prefs." );
	$self->{'autoflush'} = $args[0] if defined $args[0];
	return $self->{'autoflush'};
}

sub text
{
	my( $self, @args ) = @_;
	$self->_msg( "Storing 'text'." );
	$self->flush();
	$self->{'text'} = $args[0] if defined $args[0];
	return $self->{'text'};
}

sub delimiters
{
	my( $self, @args ) = @_;
	# we currently support both a ref to an array of delims
	# as well as an ref to an array of array refs with delims
	if ( ref($args[0]) eq 'ARRAY' )
	{
		# wipe our existing delimiters
		$self->{'delimiters'} = [];
		# we have multiple arrays
		if( ref($args[0]->[0]) eq 'ARRAY' )
		{
			for( @{$args[0]} )
			{
				$self->push_delimiters( $_ );
			}	
		}
		# we have only this array ref
		else
		{
			$self->push_delimiters( $args[0] );
		}
	}
	return @{$self->{'delimiters'}};
}

*add_delimiters = \&push_delimiters;
sub push_delimiters
{ 
	# add a delim pair (real and quoted) to the delimiters array
	my( $self, @args ) = @_;
	$self->_msg( "Adding delimiter pair." );
	if( ref($args[0]) eq 'ARRAY' )
	{
		push(
			@{$self->{'delimiters'}}, {
				real	=> $args[0],
				quoted	=> [
					quotemeta($args[0]->[0]),
					quotemeta($args[0]->[1])
				]
			}
		);
		$self->{'delim_index'}->{$args[0]->[0]} = $#{$self->{delimiters}};
		$self->{'delim_index'}->{$args[0]->[1]} = $#{$self->{delimiters}};
	}
	else
	{
		warn "Args to push_delimiter not an array reference";
	}
	return 1;
}

sub flush
{
	my( $self ) = @_;
	$self->_msg( "Flushing cached parts." );
	delete $self->{'cache'};
	return 1;
}

sub parse
{
	my( $self, @args ) = @_;
	$self->pre_parse();
	$self->init( $args[0] );
	return unless defined $self->{'text'};
	$self->flush() if $self->{'autoflush'};

	my @delim = $self->delimiters();
	my $match_rex = $self->match_expression( \@delim );

	unless( $self->{'cache'} )
	{
		# parse the text
		$self->_msg( "Data not cached. Parsing text." );
		my @chunk = split( m/$match_rex/s, $self->{'text'} );
		@{$self->{'cache'}} = @chunk;
	}

	$self->_msg( "Processing parsed text parts." );
	my $n = 0;
	while ($n <= $#{$self->{'cache'}})
	{
		# find opening delimiter
		
		# if the first element of the token is the element of a token
		#if ( $self->{cache}->[$n] eq $delim[0]->{real}->[0] || $self->{cache}->[$n] eq $delim[1]->{real}->[0] )
		if ( $self->{'cache'}->[$n] eq $delim[$self->{'delim_index'}->{$self->{'cache'}->[$n]}]->{'real'}->[0] )
		{
			$self->_msg( "Dispatching token." );
			$self->token([
				$self->{'cache'}->[$n],
				$self->{'cache'}->[++$n],
				$self->{'cache'}->[++$n]
			]);
		}

		# or it's just text
		else
		{
			$self->_msg( "Dispatching text." );
			$self->ether( $self->{'cache'}->[$n] );
		}
		$n++
	}
	$self->post_parse();
}

sub match_expression
{
	# construct our token finding regular expression
	my( $self, $delim ) = @_;
	my $rex;
	if( $self->{'loose_paring'} )
	{
		my( @left, @right );
		for( @$delim )
		{
			push( @left, $_->{'quoted'}->[0] );
			push( @right, $_->{'quoted'}->[1] );
		}
		$rex = '('.join('|', @left).')(.*?)('.join('|', @right).')';
	}
	else
	{
		my( @sets );
		for( @$delim )
		{
			push( @sets, qq{($_->{'quoted'}->[0])(.*?)($_->{'quoted'}->[1])} );
		}
		$rex = join( '|', @sets );
	}
	$self->_msg( "Constructed '$rex' pattern matching expression." );
	$self->{'match_expression'} = $rex;
	return $rex;
}

# a token consists of a left-delimiter, the contents, and a right-delimiter
*atom = \&token;
sub token
{
	my( $self, $token ) = @_;
	$self->_msg( "Found token ", join( ', ', @$token ) );
	if( $self->{'token_callback'} )
	{
		$self->_msg( "Dispatching token to callback handler '$self->{'token_callback'}'." );
		no strict 'refs';
		&{$self->{'token_callback'}}( $token );
		use strict;
	}
	else
	{
		$self->_msg( "Consider overriding my 'token' method." );
	}
	return 1;
}

# ether is anything not contained in an atom
sub ether
{
	my( $self, $text ) = @_;
	$self->_msg( "Found text ", $text );
	if( $self->{'ether_callback'} )
	{
		$self->_msg( "Dispatching text to callback handler '$self->{'ether_callback'}'." );
		no strict 'refs';
		&{$self->{'ether_callback'}}( $text );
		use strict;
	}
	else {
		$self->_msg( "Consider overriding my 'ether' method." );
	}
	return 1;
}

# this is called just before parsing begins
sub pre_parse
{
	my( $self ) = @_;
	if( $self->{'pre_callback'} )
	{
		$self->_msg( "Dispatching pre_parse event to callback handler '$self->{'pre_callback'}'." );
		no strict 'refs';
		&{$self->{'pre_callback'}}();
		use strict;
	}
	else
	{
		$self->_msg( "Consider overriding my 'pre_parse' method." );
	}
	return 1;
}


# this is called just after parsing ends
sub post_parse
{
	my( $self ) = @_;
	if( $self->{'post_callback'} )
	{
		$self->_msg( "Dispatching post_parse event to callback handler '$self->{'post_callback'}'." );
		no strict 'refs';
		&{$self->{'post_callback'}}();
		use strict;
	}
	else
	{
		$self->_msg( "Consider overriding my 'post_parse' method." );
	}
	return 1;
}

sub _msg
{
	my( $self, @msg ) = @_;
	if( $self->{'debug'} )
	{
		warn __PACKAGE__, ' - ', @msg;
	}
	return 1;
}

1;

__END__

=head1 NAME

Parse::Tokens - class for parsing text with embedded tokens

=head1 SYNOPSIS

  package MyParser;
  use base 'Parse::Tokens';

  MyParser->new->parse({
      text => q{Hi my name is <? $name ?>.},
      hash => {name=>'John Doe'},
      delimiters => [['<?','?>']],
  });

  # override SUPER::token
  sub token
  {
      my( $self, $token ) = @_;
      # $token->[0] - left bracket
      # $token->[1] - contents
      # $token->[2] - right bracket
      # do something with the token...
  }

  # override SUPER::token
  sub ether
  {
      my( $self, $text ) = @_;
      # do something with the text...
  }


=head1 DESCRIPTION
C<Parse::Tokens> provides a base class for parsing delimited strings from text blocks. Use C<Parse::Tokens> as a base class for your own module or script. Very similar in style to C<HTML::Parser>.



=head1 METHODS

=over 10

=item new()

  Pass parameter as a hash reference.
  Options are specified in the getter/setter methods.


=item flush()

  Flush the template cash.


=item parse()

  Run the parser.


=back



=head1 SETTER/GETTER METHODS

=over 10


=item autoflush()

  Turn on autoflushing causing the template cash (not the text) to be purged before each call to parse();.


=item delimiters()

  Specify delimiters as an array reference pointing to the left and right delimiters. Returns array reference containing two array references of delimiters and escaped delimiters.


=item debug()

  Turn on debug mode. 1 is on, 0 is off.


=item ether_callback()

  Sets/gets the callback code reference for the 'ether' event.


=item loose_paring()

  Allow any combination of delimiters to match. Default is turned of requiring exactly specified pair matches only.


=item post_callback()

  Sets/gets the callback code reference for the 'post_parse' event.


=item pre_callback()

  Sets/gets the callback code reference for the 'pre_parse' event.


=item push_delimiters()

  Add a delimiter pair (array ref) to the list of delimiters.
 

=item text()

  Load text.


=item token_callback()

  Sets/gets the callback code reference for the 'token' event.

=back



=head1 EVENT METHODS

=over 10


=item ether()

  Event method that gets called when non-token text is encountered during parsing.


=item post_parse()

  Event method that gets called after parsing has completed.


=item pre_parse()

  Event method that gets called prior to parsing commencing.


=item token()

  Event method that gets called when a token is encountered during parsing.

=back


=head1 HISTORY

=item 0.26

  Cleanup of internal documentation.

=item 0.25

  Added support for callbacks.
  Improved debug messaging.
  Fixed bug in delimiter assignment.
  Rearranged distribution files.

=item 0.24

  Added sample script and sample data.

=item 0.23

  Fixed pseudo bug relation to regular expression 'o' option.
  Aliased 'add_delimiters' to 'push_delimiters'.
  Misc internal changes.

=item 0.22

  Add push_delimiters method for adding to the delimiter array.

=item 0.21

  Add pre_parse and post_parse methods; add minimal debug message support.

=item 0.20

  Add multi-token support.

=head1 AUTHOR

Steve McKay, steve@colgreen.com

=head1 COPYRIGHT

Copyright 2000-2001 by Steve McKay. All rights reserved.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

perl(1).

=cut