package Search::QueryBuilder;

use 5.008007;
use strict;
use warnings;
use Data::Dumper;

require Exporter;

our @ISA = qw(Exporter);

our %EXPORT_TAGS = ( 'all' => [ qw( testme getTokenizedString tokenizeString 
	
) ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

our @EXPORT = qw(
	
);

our $VERSION = '0.01';


# Preloaded methods go here.

sub new {
	my $package = shift;
 	my $self= {
           _booleantags=> undef,
                 };
	#return bless({}, $package);
	return bless ($self,$package);
}

sub tags{
       my ( $self, @tags) = @_;
	   my @defaulttags=("AND","OR","NOT");
       @{$self->{_booleantags}} = @tags if @tags ;
       if( defined(@{$self->{_booleantags}})) {
		   return @{$self->{_booleantags}};
	   } else { 
		   return @defaulttags;
	   };
}


sub getTokenizedString {
	my ($self,$query)=@_;

	my @temp;
	my @temp2;
	my @temp3;
	my @tagbag=$self->tags;
	push(@temp2,tokenizeString($query,@temp));
	#temp2 currently represents a tokenized string

	my $previous="";
	# this cleans out most obvious mistakes 

	# Uppercase the tagbag tags...
	for(my $i=0;$i<$#temp2;$i++){
		my $test=uc($temp2[$i]);
		if((grep /^$test$/,@tagbag)>0){
			$temp2[$i]=uc($temp2[$i]);
			
		}
	}
	foreach my $tempvar (@temp2){
		# get rid of duplicates
		if($previous eq $tempvar){
			
		# or multiple commands (ie AND AND or NOT AND)
		} elsif((grep /^$previous$/,@tagbag)>0 && (grep /^$tempvar$/,@tagbag)>0){
		
		}else {
			push(@temp3,$tempvar);
		}
		$previous=$tempvar;
	}

	# Look for and remove dangling AND OR and NOT
	my $poss=($temp3[$#temp3]);
	while((grep /^$poss$/, @tagbag) >0){
		# Remove ands ors and nots from the end, where they are a bit meaningless 
		pop(@temp3);
		$poss=($temp3[$#temp3]);
	}
	
	return @temp3;
}

sub testme {
	#my $myfoo="    Bah     FOOOO GRAH BLITHER    ";
	#print ltrim($myfoo)."\n";
	#print rtrim($myfoo)."\n";
	#print atrim($myfoo)."\n";
	#print removeAll($myfoo, "A")."\n";
	#print findNearestPrevious("I am a quite long string",12,'q')."\n";
	#print tokenizeString("I am a \"fish\" and so are you")."\n";
	#print tokenizeString("I am a \"fish and so are you")."\n";
	#print tokenizeString("I am a -\"fish +\"and so\" are you")."\n";
	#print tokenizeString("I am a +\"fish +\"and so\" are you")."\n";
	#print tokenizeString("-I +am a fish and so are you too ")."\n";
	#print tokenizeString("I am a -\"fish\"and so\" are you")."\n";
	my @temp;
	my @temp2;
	push(@temp2,tokenizeString("  +\"I am\" a -\"fishy character\" and so\" is Bob",@temp));
	#print "Result: ".Data::Dumper->Dump([@temp2])."\n";
	print "Result: ".join(" ",@temp2)."\n";
}
sub atrim {
	my $string = shift;
	$string =~ s/^\s+//;
	$string =~ s/\s+$//;
	return $string;
}
# Left trim function to remove leading whitespace
sub ltrim {
	my $string = shift;
	$string =~ s/^\s+//;
	return $string;
}
# Right trim function to remove trailing whitespace
sub rtrim {
	my $string = shift;
	$string =~ s/\s+$//;
	return $string;
}

sub removeAll{
	my ($source,$replaceme)=@_;	
	$source=~s/$replaceme//g;
	return $source;
}

sub findNearestPrevious {
	my ($string, $currentidx,$char)=@_;
	my $tmpvar=$currentidx;
	my @charsinstring=split(//,$string);
	while($tmpvar>-1){
		if($charsinstring[$tmpvar] eq $char){
			return $tmpvar;
		}
		$tmpvar--;
	}
	return $tmpvar;
	
}

sub tokenizeString {
	my ($query,@response)=@_;
	$query=removeAll($query,"\'");
	atrim($query);
	my @tempresponse;
	if($query eq ""){
		return @response;
	}
	if(index($query,"\"")<0){
		# Oh jolly good, no quotation marks
		my @splitterms=split(/ /,$query);
	
		foreach my $termlet (@splitterms){
			if(substr($termlet,0,1) eq '-'){
				push(@tempresponse,"NOT");
				push(@tempresponse,substr($termlet,1,length($termlet)));

			} elsif (substr($termlet,0,1) eq '+'){
				push(@tempresponse,"AND");
				push(@tempresponse,substr($termlet,1,length($termlet)));
			} else { 
				push(@tempresponse,$termlet);
			}
			#print "Current contents: ".join(Data::Dumper->Dump([@response]),",");
			# print "Current contents: ".join(",",@response)."\n";
		}
		
	} else {
		# bugger. We have quotation marks - repeat, we have quotation marks
		my $firstIndex=index($query,"\"");
		my $secondIndex=index($query,"\"",$firstIndex+1);
		my $testVar=$secondIndex-$firstIndex;
		if($testVar>-1 && $testVar<2){ # empty quotes?! - sod it
	 	} elsif($testVar<0){ # ... lone lost little quote in middle of nowhere. Put it out of misery
			$query=removeAll($query,"\"");
			push(@tempresponse,tokenizeString($query,@response));
		} elsif ($firstIndex<1){ # first quote at beginning of string...
			push(@tempresponse, substr($query,$firstIndex+1,$testVar-1));
			if($secondIndex<length($query)){
				#push(@response,tokenizeString(substr($query,$secondIndex+1,length($query)-$secondIndex+1),@response));
				push(@tempresponse,tokenizeString(substr($query,$secondIndex+1,length($query)-$secondIndex+1),@response));
			}
		} else { # first quote not at beginning of string. First quote somewhere random
			my $firstminusone=$firstIndex-1;
			if(substr($query,$firstIndex-1,1)  eq " "){ 
					# this is fine for most instances, but sometimes there's a - or a + in the way
					# deal with the most instances first
				push(@tempresponse,tokenizeString(substr($query,0,$firstIndex)));
				push(@tempresponse,substr($query,$firstIndex+1,$testVar-1));
				if($secondIndex<length($query)){
					push(@tempresponse,tokenizeString(substr($query,$secondIndex+1,length($query)-$secondIndex)));
				}

			} else { # there' s a - or + before the "!! the (*&£$(*&!'s!
				my $thirdIndex=findNearestPrevious($query,$firstIndex," ");

				if($thirdIndex<0){ # no space start of query
					if(substr($query,0,1) eq "-"){
						push(@tempresponse,"NOT");
						push(@tempresponse,substr($query,2,$testVar-1));
					} elsif(substr($query,0,1) eq "+"){
						push(@tempresponse,"AND");
						push(@tempresponse,substr($query,2,$testVar-1));
					} else {
						push(@tempresponse,substr($query,0,$testVar-1));
						
					}

						
				} else { # there's a - or + before the ", and we are not at the start of the string...
					# push(@response,substr($query,0,$thirdIndex));
					push(@tempresponse,tokenizeString(substr($query,0,$thirdIndex),@response));
					if(substr($query,$thirdIndex+1,1) eq '-'){ # oh look, a -
						push(@tempresponse,"NOT");
						$thirdIndex++;
					}elsif(substr($query,$thirdIndex+1,1) eq '+'){
						push(@tempresponse,"AND");
						$thirdIndex++;
					}	
					push(@tempresponse,substr($query,$thirdIndex+2,$secondIndex-$thirdIndex-2));
					
					
				}
				if($secondIndex<length($query)){
					# yet more to play with?
					push(@tempresponse,tokenizeString(substr($query,$secondIndex+2,length($query)-$secondIndex),@response));
				
			
				}
			}



		}


	}
	return @tempresponse;
}

sub build{
	my $self = shift;
	return;
}

1;
__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

Search::QueryBuilder - Perl extension for tokenising search strings and building queries

=head1 SYNOPSIS

  use Search::QueryBuilder;
  my $querybuild=new QueryBuilder;  
  # $querystring contains a query along the lines of "A key phrase" -"keyword" AND keyword +keyword
  my @responsearr=$querybuild->getTokenizedString($querystring);
  # @responsearr contains the keywords and phrases, plus booleans where provided
  
  If you intend to use more than the basic booleans, add them as follows:
  my @mytaglist=("AND","OR","NOT","XOR");
  $querybuild->tags(@mytaglist);

=head1 DESCRIPTION

QueryBuilder is a very simple tokeniser, designed to decode a query string 
into something you can create a database query from (into SQL, Cheshire, that 
sort of thing).

It tries to do a little sanity checking, but is still in active development 
so expect changes. The creation of queries from the resulting string is 
partly merged into the module, but expect this interface to change.

=head2 EXPORT

None by default.


=head1 SEE ALSO

There are actually several other modules out there that are related - 
for example, String::Tokeniser, String::Tokenizer, Parse::Tokens and 
Text::Tokenizer. 

This one is designed specifically for use as part of a search system, and is 
likely to evolve in that direction (handling of prefixes, building up search 
queries in various formats and so forth).

Other approaches to this include Search::Circa and Search::QueryParser. 


=head1 AUTHOR

E Tonkin, E<lt>cselt@users.sourceforge.net<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by E Tonkin 

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.


=cut