The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#
# SWISH::Prog-based Swish3 implementation
#
use strict;
use warnings;
use Carp;
use SWISH::Prog;
use SWISH::3;
use Getopt::Long qw(:config no_ignore_case);
use Data::Dump qw( dump );
use Rose::DateTime::Util qw(parse_date);
use Time::HiRes;
use Search::Tools::UTF8;

binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';

my $VERSION = '3.0.16';

my $USAGE = qq{$0
 usage:
    swish3 [-E N] [-i dir file ... ] [-S aggregator] [-c file] [-f invindex] [-l] [-v (num)]
    swish3 -w word1 word2 ... [-f file1 file2 ...] \
          [-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \
          [-m num] [-t str] [-d delim] [-H (num)] [-x output_format] \
          [-R rank_scheme] [-L prop low high]
    swish3 -k (char|*) [-f invindex1 invindex2 ...]
    swish3 -M invindex1 ... outputfile
    swish3 -N path/to/compare/file or date
    swish3 -V
    swish3 -h
    swish3 -D n

 options: defaults are in brackets
 # commented options are not yet supported

 indexing options:
   --aggregator_opts : name=value
    -c : configuration file
    -D : Debug mode
   --doc_filter : doc_filter
    -E : next param is total expected files (generates progress bar)
    -f : invindex dir to create or search from [index.swish3]
    -F : next param is invindex format (ks, lucy, xapian, native, or dbi) [native]
    -i : create an index from the specified files
        for "-S fs" - specify a list of files or directories
        for "-S spider" - specify a list of URLs
    -h : print this usage statement
    -l : follow symbolic links when indexing
    -M : merges index files
    -N : index only files with a modification date newer than path or date
         if no argument, defaults to [indexdir/swish_last_start]
    -S : specify which aggregator to use.
        Valid options are:
         "fs" - local files in your File System
         "spider" - web site files using a web crawler
         #"prog"  - use the program API
        The default value is: "fs"
    #-T : Trace options ('-T help' for info)
    -v : indexing verbosity level (0 to 3) [-v 1]
    -W : next param is ParserWarnLevel [-W 2]

 search options:
    -b : begin results at this number
    #-d : next param is delimiter.
    #-E : Append errors to file specified, or stderr if file not specified.
    #-e : "Economic Mode": The index proccess uses less RAM.
    -f : invindex dir to create or search from [index.swish3]
    -F : next param is invindex format (ks, lucy, xapian, native, or dbi) [native]
    -h : print this usage statement
    -H : "Result Header Output": verbosity (0 to 9)  [1].
    #-k : Print words starting with a given char.
    -L : Limit results to a range of property values
    -m : the maximum number of results to return [defaults to all results]
    -n : query parser special NULL term
    #-P : next param is Phrase delimiter.
    #-p : include these document properties in the output "prop1 prop2 ..."
    -R : next param is Rank Scheme number (0 to 1)  [0].
    -s : sort by these document properties in the output "prop1 prop2 ..."
    #-T : Trace options ('-T help' for info)
    #-t : tags to search in - specify as a string
    #    "HBthec" - in Head|Body|title|header|emphasized|comments
    -V : prints the current version
    -v : indexing verbosity level (0 to 3) [-v 1]
    -w : search for words "word1 word2 ..."
    -x : "Extended Output Format": Specify the output format.

      swish3 version : $VERSION
SWISH::Prog::VERSION : $SWISH::Prog::VERSION
                docs : http://swish-e.org/swish3/

};

my $Opt = {
    aggregator_opts    => {},
    begin              => 0,
    config             => '',
    Debug              => 0,
    doc_filter         => undef,
    Expected           => undef,
    extended_output    => undef,
    folder             => undef,
    Format             => 'native',
    Headers            => 1,
    help               => 0,
    input              => 0,
    Limit              => [],
    links              => 0,
    lucy_highlightable => 0,
    max                => undef,
    Merge              => undef,
    newer_than         => undef,
    null_term          => undef,
    query              => '',
    sort_order         => '',
    Source             => 'fs',
    test_mode          => $ENV{SWISH_TEST} || 0,
    verbose            => undef,
    Version            => 0,
    Warnings           => 2,
};

my %allopts = (
    'aggregator|Source=s' => 'aggregator type',
    'aggregator_opts=s'   => 'aggregator options',
    'begin=i'             => 'begin results [0]',
    'config=s'            => 'config file',
    'debug|Debug:i'       => 'debugging',
    'doc_filter=s'        => 'document filter sub ref',
    'Expected=i'          => 'expected total docs (generates progress bar)',
    'extended_output|x=s' => 'extended output format string',
    'folder=s'            => 'invindex dir',
    'Format=s'            => 'indexer type (native, ks, lucy, xapian, dbi)',
    'Headers=i'           => 'Header output verbosity',
    'help'                => 'print usage',
    'input'               => 'indexing mode',
    'Limit=s'             => 'Limit to range of values',
    'links'               => 'follow symbolic links when indexing',
    'lucy_highlightable'  => 'set highlightable_fields for Lucy indexer',
    'max=i'               => 'max results [all]',
    'Merge'               => 'merge 2 or more indexes',
    'newer_than|N:s'      => 'index only documents newer than',
    'null_term|n'         => 'NULL query parser term',
    'query|words=s'       => 'search query',
    'RankScheme=i'        => 'ranking algorithm (native only)',
    'sort_order=s'        => 'result sort order',
    'test_mode'           => 'set with SWISH_TEST env var',
    'verbose:i'           => 'be verbose',
    'Version'             => 'print Version',
    'Warnings=i'          => 'print libxml2 warnings',
);

my %native_only_opts = ( 'RankScheme' => 1, );

GetOptions( $Opt, keys %allopts ) or die $USAGE;

if ( $Opt->{Version} ) {
    print "$0 $VERSION\n";
    exit;
}
if ( $Opt->{help} ) {
    print $USAGE;
    exit;
}

for my $opt ( keys %$Opt ) {
    if ( lc( $Opt->{Format} ) ne 'native'
        and exists $native_only_opts{$opt} )
    {
        ( my $letter = $opt ) =~ s/^(.).+/$1/;
        print "-$letter (--$opt) is a native-only format option\n";
        exit 1;
    }
}

#croak $USAGE unless @ARGV;

if ( $Opt->{input} ) {
    $Opt->{indexer} = $Opt->{Format};
}
$Opt->{invindex} = $Opt->{folder};

$Opt->{debug} and Data::Dump::dump $Opt;

if ( defined $Opt->{verbose} and $Opt->{verbose} == 0 ) {
    $Opt->{verbose} = 1;
}

if ( !exists $ENV{SWISH_WARNINGS} ) {
    $ENV{SWISH_WARNINGS}        = $Opt->{Warnings};
    $ENV{SWISH_PARSER_WARNINGS} = $Opt->{Warnings};
}

my %prog_can = ( highlightable_fields => $Opt->{lucy_highlightable}, );

for my $opt ( keys %$Opt ) {
    if ( SWISH::Prog->can($opt) ) {
        $prog_can{$opt} = $Opt->{$opt};
    }
    elsif ( $opt eq 'doc_filter' ) {
        $prog_can{filter} = $Opt->{$opt};
    }
}
if ( $Opt->{input} ) {
    my $prog = SWISH::Prog->new(%prog_can);
    if ( defined $Opt->{newer_than} ) {

        if ( !length $Opt->{newer_than} ) {
            $Opt->{newer_than}
                = $prog->indexer->invindex->path->file('swish_last_start');
        }

        # if it's a file, stat it,
        # otherwise convert to timestamp
        my $ts;
        my $dt = parse_date( $Opt->{newer_than} );
        if ( !defined $dt ) {
            my $stat = [ stat( $Opt->{newer_than} ) ];
            if ( !defined $stat->[9] ) {
                croak
                    "-N option must be a valid date string or a readable file: $!";
            }
            $ts = $stat->[9];
        }
        else {
            $ts = $dt->epoch;
        }
        $prog->aggregator->set_ok_if_newer_than($ts);
        printf "Skipping documents older than %s\n", scalar localtime($ts);

    }
    if ( $Opt->{links} and $Opt->{Source} eq 'fs' ) {
        $prog->aggregator->config->FollowSymLinks(1);
    }
    if ( $Opt->{Expected} ) {
        $prog->aggregator->progress( progress_bar( $Opt->{Expected} ) );
    }

    # this seems to DWIM and quiets a warning from SWISH::3
    $prog->aggregator->set_parser_from_type(1);

    $prog->indexer->invindex->path->mkpath( $Opt->{debug} );
    my $start    = time();
    my $num_docs = $prog->index(@ARGV);
    my $end      = time();
    my $elapsed  = $end - $start;
    printf( "%d documents in %s\n", ( $num_docs || 0 ), secs2hms($elapsed) );
}
elsif ( $Opt->{query} ) {
    my $invindex = SWISH::Prog::InvIndex->new( path => $Opt->{invindex} );
    my $meta     = $invindex->meta;
    my $format   = $meta->Index->{Format};
    my $sclass   = "SWISH::Prog::${format}::Searcher";
    eval "require $sclass";
    croak $@ if $@;
    my %qp_config = (
        dialect          => $format,
        query_class_opts => { debug => $Opt->{debug} }
    );
    if ( $Opt->{null_term} ) {
        $qp_config{null_term} = 'NULL';
    }
    my $searcher = $sclass->new(
        invindex  => $invindex->path . '',
        qp_config => \%qp_config,
        debug     => $Opt->{debug},
    );
    my $start_time = Time::HiRes::time();
    my $results;
    eval {
        $results = $searcher->search(
            to_utf8( $Opt->{query} ),
            {   start       => $Opt->{begin},
                max         => $Opt->{max},
                limit       => parse_limits( $Opt->{Limit} ),
                rank_scheme => $Opt->{RankScheme},
                order       => $Opt->{sort_order},
            }
        );
    };

    if ($@) {
        my $errmsg = "$@";
        $errmsg =~ s/ at \/[\w\/\.]+ line \d+\.?.*$//s;
        die "Error: $errmsg\n";
    }

    my $search_time = Time::HiRes::time() - $start_time;

    if ( $Opt->{Headers} ) {
        printf( "# swish3 version %s\n", $VERSION );
        printf( "# Format: %s\n",        $format );
        printf( "# Query: %s\n",         to_utf8( $Opt->{query} ) );
        printf( "# Hits: %d\n",          $results->hits );
        printf( "# Search time: %.4f\n", $search_time );
    }

    if ( $Opt->{Headers} > 1 ) {
        printf( "# Parsed Query: %s\n", $results->query );
    }

    my ( $output_format, $output_format_str );

    if ( $Opt->{extended_output} ) {
        my @props;
        my $default_properties = SWISH::3::SWISH_DOC_PROP_MAP();
        while ( $Opt->{extended_output} =~ m/<(.+?)>/g ) {
            my $p = $1;
            if (    !exists $meta->PropertyNames->{$p}
                and !exists $default_properties->{$p}
                and $p ne 'swishtitle'
                and $p ne 'swishdescription'
                and $p ne 'swishrank' )
            {
                die "Invalid PropertyName: $p\n";
            }
            else {
                push @props, $p;
            }
        }
        $output_format_str = $Opt->{extended_output};
        for my $prop (@props) {
            $output_format_str =~ s/<$prop>/\%s/g;    # TODO ints and dates
        }

        # make escaped chars work
        $output_format_str =~ s/\\n/\n/g;
        $output_format_str =~ s/\\t/\t/g;
        $output_format_str =~ s/\\r/\r/g;

        $output_format = \@props;

        #warn "str: $output_format_str\n";
        #warn dump $output_format;
    }

    my $counter = 0;
    while ( my $result = $results->next ) {
        if ($output_format) {
            my @res;
            for my $prop (@$output_format) {
                my $val;
                if ( $prop eq 'swishrank' ) {
                    $val = $result->score;
                }
                else {
                    $val = $result->get_property($prop);
                }
                $val = '' unless defined $val;
                $val =~ s/\003/\\x{03}/g;
                push( @res, to_utf8($val) );
            }
            printf( $output_format_str, @res );
        }
        else {
            printf( qq{%4d %s "%s"\n},
                $result->score, $result->uri, $result->title );
        }
        if ( $Opt->{max} ) {
            last if ++$counter >= $Opt->{max};
        }
    }
    print ".\n";
}
elsif ( $Opt->{Merge} ) {
    my @indexes = @ARGV;
    my $num     = scalar(@indexes);
    if ( $num < 2 ) {
        die "-M requires minimum of 2 arguments\n";
    }
    if ( $Opt->{Format} eq 'native' ) {
        require SWISH::Prog::Native::Indexer;
        my $target_invindex = pop(@indexes);
        my $indexer         = SWISH::Prog::Native::Indexer->new(
            invindex => $target_invindex,
            debug    => $Opt->{debug},
            verbose  => $Opt->{verbose},
        );
        if ( !$indexer->invindex->meta ) {
            die "$target_invindex missing swish.xml header file";
        }
        $indexer->merge(
            map { SWISH::Prog::Native::InvIndex->new( path => $_ ) }
                @indexes );
    }
    else {
        die "InvIndex Format $Opt->{Format} not supported for -M option\n";
    }
}
elsif ( $Opt->{Version} ) {
    print "$0 $VERSION\n";
}
else {
    print $USAGE;
}
exit;

sub secs2hms {
    my $secs  = shift || 0;
    my $hours = int( $secs / 3600 );
    my $rm    = $secs % 3600;
    my $min   = int( $rm / 60 );
    my $sec   = $rm % 60;
    return sprintf( "%02d:%02d:%02d", $hours, $min, $sec );
}

sub parse_limits {
    my $limits = shift or return;
    if ( !@$limits ) {
        return $limits;
    }
    my @parsed;
    for my $lim (@$limits) {
        push @parsed, [ split( /\s+/, $lim ) ];
    }
    return \@parsed;

}

sub progress_bar {
    require Term::ProgressBar;
    my $total = shift;
    my $tpb   = Term::ProgressBar->new(
        {   ETA   => 'linear',
            name  => 'swish3',
            count => $total,
        }
    );
    return $tpb;
}

__END__

=pod

=head1 NAME

swish3 - SWISH::Prog-based Swish3 implementation

=head1 SYNOPSIS

    swish3 [-E N] [-i dir file ... ] [--doc_filter filter] [-S aggregator] [-c file] [-f invindex] [-l] \
           [-v (num)]
    swish3 -w word1 word2 ... [-f file1 file2 ...] \
          [-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \
          [-m num] [-t str] [-H (num)] [-x output_format] \
          [-R rank_scheme] [-L prop low high]
    swish3 -k (char|*) [-f invindex1 invindex2 ...]
    swish3 -M invindex1 invindex2 ... outputfile
    swish3 -N path/to/compare/file or date
    swish3 -V
    swish3 -D n

 NOTE THAT SOME OPTIONS ARE NOT YET SUPPORTED.
 Type:
  % swish3 -h

=head1 DESCRIPTION

B<swish3> is one example of implementing the Swish3 API. B<swish3>
is written in Perl using the SWISH::Prog framework.

Type:

 % swish3 -h

for a complete list of options.

=head1 AUTHOR

Peter Karman, E<lt>perl@peknet.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2009-2010 by Peter Karman

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

http://swish-e.org/swish3/

=cut