The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#
# Copyright (c) 2006 Peter Karman - perl@peknet.com
#
# mostly from the SWISH::API man page
# plus Search::Tools stuff
#

use strict;
use warnings;

use FindBin;
use lib "$FindBin::Bin/../lib";

use POSIX qw(locale_h);
use locale;

use Carp;
use Data::Dump qw( dump );
use SWISH::API::Object;

use Search::Tools;
use Search::Tools::UTF8;
use Getopt::Long;
use Text::Wrap;
use File::Basename;
use Time::HiRes;

binmode STDOUT, ':utf8';

my %skip       = ();                # properties to skip in output
my $debug      = 0;
my $high       = undef;
my $low        = undef;
my $property   = undef;
my $i          = 'index.swish-e';
my $maxresults = undef;
my $help       = 0;
my $col        = 20;                # width of gutter between props and text
my $script     = basename($0);
my ($charset) = (setlocale(LC_CTYPE) =~ m/^.+?\.(.+)/);
$charset ||= 'iso-8859-1';
my $interactive = 0;

my $usage = <<HELP;

    $script [opts] query
    
    $script is a Perl-based debugger for Swish-e indexes.
    
    $script will dump all properties for matches to 'query'.
    
    In addition, long properties will be snipped
    and highlighted using the Search::Tools modules.

    $script is NOT a replacement for the swish-e tool. It
    is an example of using SWISH::API::Stat and Search::Tools
    together.
    
    $script output will all be in UTF-8 regardless of how
    the properties are stored in the Swish-e index. The current
    LOCALE env settings will be applied to non UTF-8 strings when
    converting to UTF-8.
    
    Options:
    
     --index name       specify index name (default: $i)
     --debug [n]        turn on debugging
     --property propname
                        limit results by 'propname'
     --high val         with --property, set high limit
     --low  val         with --property, set low limit
     --max  n           maximum number of results to print
     --charset charset  convert properties to UTF8 from 'charset' ($charset)
     --help             print this message
     --interactive      make successive queries
     
HELP

GetOptions(
           'debug:i'     => \$debug,
           'index=s'     => \$i,
           'high=s'      => \$high,
           'low=s'       => \$low,
           'property=s'  => \$property,
           'max=i'       => \$maxresults,
           'help'        => \$help,
           'charset'     => \$charset,
           'interactive' => \$interactive,
          )
  or die $usage;

die $usage if $help;

my $trans = Search::Tools::Transliterate->new;

my $swish = open_index($i);

# ignoreWordCount not always on
#$swish->RankScheme( 1 );

if ($interactive)
{
    while (1)
    {
        print "swish> ";
        my $q = <STDIN>;
        chomp($q) if $q;
        search($q);
        print "\n";
    }

}
else
{
    die $usage unless @ARGV;

    search(join(' ', @ARGV));

}

#####################################################

sub open_index
{
    my $i     = shift;
    my $swish = SWISH::API::Object->new(indexes => "$i", log => *{STDERR});

    #my $swish = SWISH::API->new("$i");

    if ($swish->Error)
    {
        print $usage;
        $swish->AbortLastError;
    }
    return $swish;
}

sub search
{
    my $q = shift;

    my $start  = [Time::HiRes::gettimeofday()];
    my $search = $swish->New_Search_Object;

    if ($property)
    {
        $search->SetSearchLimit($property, $low, $high);
        $swish->AbortLastError if $swish->Error;
    }

    #carp dump $search;

    # then in a loop
    my $results = $search->Execute($q);

    # always check for errors (but aborting is not always necessary)

    $swish->AbortLastError
      if $swish->Error;

    # Display a list of results

    my $hits  = $results->Hits;
    my $limit = $maxresults || $hits;

    if (!$hits)
    {
        print "No Results\n";
    }
    else
    {

        print "Found $hits hits\n";
        print "Search time: ";
        printf("%0.4f sec\n",
               Time::HiRes::tv_interval($start, [Time::HiRes::gettimeofday()]));

        my $start_display = [Time::HiRes::gettimeofday()];

        my $wc  = $swish->HeaderValue($i, 'WordCharacters');
        my $igf = $swish->HeaderValue($i, 'IgnoreFirstChar') || '';
        my $igl = $swish->HeaderValue($i, 'IgnoreLastChar') || '';

        my $kwre = Search::Tools->regexp(
            debug   => $debug,
            query   => join(' ', $results->ParsedWords($i)),
            stemmer => $swish->HeaderValue($i, 'Fuzzy Mode') ne 'None'
            ? \&stem
            : undef,
            word_characters   => to_utf8($wc,  $charset),
            ignore_first_char => to_utf8($igf, $charset),
            ignore_last_char  => to_utf8($igl, $charset),
            charset           => $charset,

                                        );
        my $snipper = Search::Tools->snipper(debug => $debug, query => $kwre);

        my $hiliter = Search::Tools->hiliter(
            debug => $debug,
            query => $kwre,
            tty   => 1,
            no_html => 1,   # can screw up objects with ref values
                                            );

        my $fw = $swish->fuzzify($i, $q);

        my @fuzz = $fw->WordList;

        print "fuzzy: ", join(' ', @fuzz), "\n";

        my $stemmed = stem(undef, $q);

        print "stemmed query: $stemmed\n";

        my $count  = 0;
        my $larrow = to_utf8(chr(187));
        my $rarrow = to_utf8(chr(171));

        while (my $result = $results->NextResult)
        {
            last if ++$count > $limit;

            print "~" x 80 . "\n";

          PROP: for my $prop ($swish->props)
            {

                next PROP if exists $skip{$prop};

                my $v = $result->$prop || '';
                unless (ref $v)
                {
                    $v = to_utf8($v);
                }
                else
                {
                    $v = dump($v);
                }

                if ($prop eq 'swishlastmodified')
                {
                    $v = localtime($v);
                }
                else
                {
                    $v = $snipper->snip($v) if length($v) > 80;
                    $v = $hiliter->light($v);
                }

                # Text::Wrap has a undesirable effect of indenting on right side
                # the same amount as left, so we hack around that for prettier printing

                my $space   = ' ' x ($col - length($prop));
                my $gutter  = ' ' x $col;
                my $wrapped = wrap("", "", $v);
                $wrapped =~ s,\n,\n$gutter,g;
                print($prop, $space, $larrow, $wrapped, $rarrow, "\n");

            }

        }

        print "Render time: ";
        printf(
               "%0.4f sec\n",
               Time::HiRes::tv_interval(
                                        $start_display,
                                        [Time::HiRes::gettimeofday()]
                                       )
              );

    }

}

# this function also passed as stemmer param to S::T::KeyWords
sub stem
{
    if ($SWISH::API::VERSION < 0.04)
    {
        die "stem() requires SWISH::API version 0.04 or newer\n";
    }

    my $kwobj = shift;
    my $w     = shift;

    my $fw = $swish->Fuzzify($i, $w);

    my @fuzz = $fw->WordList;

    if (my $e = $fw->WordError)
    {

        warn "Error in Fuzzy WordList ($e): $!\n";
        return undef;

    }

    return $fuzz[0];    # we ignore possible doublemetaphone

}