#!/usr/bin/perl
#
# SWISH::Prog-based Swish3 implementation
#
use strict;
use warnings;
use Carp;
use SWISH::Prog;
use SWISH::3;
use Getopt::Long qw(:config no_ignore_case);
use Data::Dump qw( dump );
use Rose::DateTime::Util qw(parse_date);
use Time::HiRes;
use Search::Tools::UTF8;
binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';
my $VERSION = '3.0.16';
my $USAGE = qq{$0
usage:
swish3 [-E N] [-i dir file ... ] [-S aggregator] [-c file] [-f invindex] [-l] [-v (num)]
swish3 -w word1 word2 ... [-f file1 file2 ...] \
[-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \
[-m num] [-t str] [-d delim] [-H (num)] [-x output_format] \
[-R rank_scheme] [-L prop low high]
swish3 -k (char|*) [-f invindex1 invindex2 ...]
swish3 -M invindex1 ... outputfile
swish3 -N path/to/compare/file or date
swish3 -V
swish3 -h
swish3 -D n
options: defaults are in brackets
# commented options are not yet supported
indexing options:
--aggregator_opts : name=value
-c : configuration file
-D : Debug mode
--doc_filter : doc_filter
-E : next param is total expected files (generates progress bar)
-f : invindex dir to create or search from [index.swish3]
-F : next param is invindex format (ks, lucy, xapian, native, or dbi) [native]
-i : create an index from the specified files
for "-S fs" - specify a list of files or directories
for "-S spider" - specify a list of URLs
-h : print this usage statement
-l : follow symbolic links when indexing
-M : merges index files
-N : index only files with a modification date newer than path or date
if no argument, defaults to [indexdir/swish_last_start]
-S : specify which aggregator to use.
Valid options are:
"fs" - local files in your File System
"spider" - web site files using a web crawler
#"prog" - use the program API
The default value is: "fs"
#-T : Trace options ('-T help' for info)
-v : indexing verbosity level (0 to 3) [-v 1]
-W : next param is ParserWarnLevel [-W 2]
search options:
-b : begin results at this number
#-d : next param is delimiter.
#-E : Append errors to file specified, or stderr if file not specified.
#-e : "Economic Mode": The index proccess uses less RAM.
-f : invindex dir to create or search from [index.swish3]
-F : next param is invindex format (ks, lucy, xapian, native, or dbi) [native]
-h : print this usage statement
-H : "Result Header Output": verbosity (0 to 9) [1].
#-k : Print words starting with a given char.
-L : Limit results to a range of property values
-m : the maximum number of results to return [defaults to all results]
-n : query parser special NULL term
#-P : next param is Phrase delimiter.
#-p : include these document properties in the output "prop1 prop2 ..."
-R : next param is Rank Scheme number (0 to 1) [0].
-s : sort by these document properties in the output "prop1 prop2 ..."
#-T : Trace options ('-T help' for info)
#-t : tags to search in - specify as a string
# "HBthec" - in Head|Body|title|header|emphasized|comments
-V : prints the current version
-v : indexing verbosity level (0 to 3) [-v 1]
-w : search for words "word1 word2 ..."
-x : "Extended Output Format": Specify the output format.
swish3 version : $VERSION
SWISH::Prog::VERSION : $SWISH::Prog::VERSION
docs : http://swish-e.org/swish3/
};
my $Opt = {
aggregator_opts => {},
begin => 0,
config => '',
Debug => 0,
doc_filter => undef,
Expected => undef,
extended_output => undef,
folder => undef,
Format => 'native',
Headers => 1,
help => 0,
input => 0,
Limit => [],
links => 0,
lucy_highlightable => 0,
max => undef,
Merge => undef,
newer_than => undef,
null_term => undef,
query => '',
sort_order => '',
Source => 'fs',
test_mode => $ENV{SWISH_TEST} || 0,
verbose => undef,
Version => 0,
Warnings => 2,
};
my %allopts = (
'aggregator|Source=s' => 'aggregator type',
'aggregator_opts=s' => 'aggregator options',
'begin=i' => 'begin results [0]',
'config=s' => 'config file',
'debug|Debug:i' => 'debugging',
'doc_filter=s' => 'document filter sub ref',
'Expected=i' => 'expected total docs (generates progress bar)',
'extended_output|x=s' => 'extended output format string',
'folder=s' => 'invindex dir',
'Format=s' => 'indexer type (native, ks, lucy, xapian, dbi)',
'Headers=i' => 'Header output verbosity',
'help' => 'print usage',
'input' => 'indexing mode',
'Limit=s' => 'Limit to range of values',
'links' => 'follow symbolic links when indexing',
'lucy_highlightable' => 'set highlightable_fields for Lucy indexer',
'max=i' => 'max results [all]',
'Merge' => 'merge 2 or more indexes',
'newer_than|N:s' => 'index only documents newer than',
'null_term|n' => 'NULL query parser term',
'query|words=s' => 'search query',
'RankScheme=i' => 'ranking algorithm (native only)',
'sort_order=s' => 'result sort order',
'test_mode' => 'set with SWISH_TEST env var',
'verbose:i' => 'be verbose',
'Version' => 'print Version',
'Warnings=i' => 'print libxml2 warnings',
);
my %native_only_opts = ( 'RankScheme' => 1, );
GetOptions( $Opt, keys %allopts ) or die $USAGE;
if ( $Opt->{Version} ) {
print "$0 $VERSION\n";
exit;
}
if ( $Opt->{help} ) {
print $USAGE;
exit;
}
for my $opt ( keys %$Opt ) {
if ( lc( $Opt->{Format} ) ne 'native'
and exists $native_only_opts{$opt} )
{
( my $letter = $opt ) =~ s/^(.).+/$1/;
print "-$letter (--$opt) is a native-only format option\n";
exit 1;
}
}
#croak $USAGE unless @ARGV;
if ( $Opt->{input} ) {
$Opt->{indexer} = $Opt->{Format};
}
$Opt->{invindex} = $Opt->{folder};
$Opt->{debug} and Data::Dump::dump $Opt;
if ( defined $Opt->{verbose} and $Opt->{verbose} == 0 ) {
$Opt->{verbose} = 1;
}
if ( !exists $ENV{SWISH_WARNINGS} ) {
$ENV{SWISH_WARNINGS} = $Opt->{Warnings};
$ENV{SWISH_PARSER_WARNINGS} = $Opt->{Warnings};
}
my %prog_can = ( highlightable_fields => $Opt->{lucy_highlightable}, );
for my $opt ( keys %$Opt ) {
if ( SWISH::Prog->can($opt) ) {
$prog_can{$opt} = $Opt->{$opt};
}
elsif ( $opt eq 'doc_filter' ) {
$prog_can{filter} = $Opt->{$opt};
}
}
if ( $Opt->{input} ) {
my $prog = SWISH::Prog->new(%prog_can);
if ( defined $Opt->{newer_than} ) {
if ( !length $Opt->{newer_than} ) {
$Opt->{newer_than}
= $prog->indexer->invindex->path->file('swish_last_start');
}
# if it's a file, stat it,
# otherwise convert to timestamp
my $ts;
my $dt = parse_date( $Opt->{newer_than} );
if ( !defined $dt ) {
my $stat = [ stat( $Opt->{newer_than} ) ];
if ( !defined $stat->[9] ) {
croak
"-N option must be a valid date string or a readable file: $!";
}
$ts = $stat->[9];
}
else {
$ts = $dt->epoch;
}
$prog->aggregator->set_ok_if_newer_than($ts);
printf "Skipping documents older than %s\n", scalar localtime($ts);
}
if ( $Opt->{links} and $Opt->{Source} eq 'fs' ) {
$prog->aggregator->config->FollowSymLinks(1);
}
if ( $Opt->{Expected} ) {
$prog->aggregator->progress( progress_bar( $Opt->{Expected} ) );
}
# this seems to DWIM and quiets a warning from SWISH::3
$prog->aggregator->set_parser_from_type(1);
$prog->indexer->invindex->path->mkpath( $Opt->{debug} );
my $start = time();
my $num_docs = $prog->index(@ARGV);
my $end = time();
my $elapsed = $end - $start;
printf( "%d documents in %s\n", ( $num_docs || 0 ), secs2hms($elapsed) );
}
elsif ( $Opt->{query} ) {
my $invindex = SWISH::Prog::InvIndex->new( path => $Opt->{invindex} );
my $meta = $invindex->meta;
my $format = $meta->Index->{Format};
my $sclass = "SWISH::Prog::${format}::Searcher";
eval "require $sclass";
croak $@ if $@;
my %qp_config = (
dialect => $format,
query_class_opts => { debug => $Opt->{debug} }
);
if ( $Opt->{null_term} ) {
$qp_config{null_term} = 'NULL';
}
my $searcher = $sclass->new(
invindex => $invindex->path . '',
qp_config => \%qp_config,
debug => $Opt->{debug},
);
my $start_time = Time::HiRes::time();
my $results;
eval {
$results = $searcher->search(
to_utf8( $Opt->{query} ),
{ start => $Opt->{begin},
max => $Opt->{max},
limit => parse_limits( $Opt->{Limit} ),
rank_scheme => $Opt->{RankScheme},
order => $Opt->{sort_order},
}
);
};
if ($@) {
my $errmsg = "$@";
$errmsg =~ s/ at \/[\w\/\.]+ line \d+\.?.*$//s;
die "Error: $errmsg\n";
}
my $search_time = Time::HiRes::time() - $start_time;
if ( $Opt->{Headers} ) {
printf( "# swish3 version %s\n", $VERSION );
printf( "# Format: %s\n", $format );
printf( "# Query: %s\n", to_utf8( $Opt->{query} ) );
printf( "# Hits: %d\n", $results->hits );
printf( "# Search time: %.4f\n", $search_time );
}
if ( $Opt->{Headers} > 1 ) {
printf( "# Parsed Query: %s\n", $results->query );
}
my ( $output_format, $output_format_str );
if ( $Opt->{extended_output} ) {
my @props;
my $default_properties = SWISH::3::SWISH_DOC_PROP_MAP();
while ( $Opt->{extended_output} =~ m/<(.+?)>/g ) {
my $p = $1;
if ( !exists $meta->PropertyNames->{$p}
and !exists $default_properties->{$p}
and $p ne 'swishtitle'
and $p ne 'swishdescription'
and $p ne 'swishrank' )
{
die "Invalid PropertyName: $p\n";
}
else {
push @props, $p;
}
}
$output_format_str = $Opt->{extended_output};
for my $prop (@props) {
$output_format_str =~ s/<$prop>/\%s/g; # TODO ints and dates
}
# make escaped chars work
$output_format_str =~ s/\\n/\n/g;
$output_format_str =~ s/\\t/\t/g;
$output_format_str =~ s/\\r/\r/g;
$output_format = \@props;
#warn "str: $output_format_str\n";
#warn dump $output_format;
}
my $counter = 0;
while ( my $result = $results->next ) {
if ($output_format) {
my @res;
for my $prop (@$output_format) {
my $val;
if ( $prop eq 'swishrank' ) {
$val = $result->score;
}
else {
$val = $result->get_property($prop);
}
$val = '' unless defined $val;
$val =~ s/\003/\\x{03}/g;
push( @res, to_utf8($val) );
}
printf( $output_format_str, @res );
}
else {
printf( qq{%4d %s "%s"\n},
$result->score, $result->uri, $result->title );
}
if ( $Opt->{max} ) {
last if ++$counter >= $Opt->{max};
}
}
print ".\n";
}
elsif ( $Opt->{Merge} ) {
my @indexes = @ARGV;
my $num = scalar(@indexes);
if ( $num < 2 ) {
die "-M requires minimum of 2 arguments\n";
}
if ( $Opt->{Format} eq 'native' ) {
require SWISH::Prog::Native::Indexer;
my $target_invindex = pop(@indexes);
my $indexer = SWISH::Prog::Native::Indexer->new(
invindex => $target_invindex,
debug => $Opt->{debug},
verbose => $Opt->{verbose},
);
if ( !$indexer->invindex->meta ) {
die "$target_invindex missing swish.xml header file";
}
$indexer->merge(
map { SWISH::Prog::Native::InvIndex->new( path => $_ ) }
@indexes );
}
else {
die "InvIndex Format $Opt->{Format} not supported for -M option\n";
}
}
elsif ( $Opt->{Version} ) {
print "$0 $VERSION\n";
}
else {
print $USAGE;
}
exit;
sub secs2hms {
my $secs = shift || 0;
my $hours = int( $secs / 3600 );
my $rm = $secs % 3600;
my $min = int( $rm / 60 );
my $sec = $rm % 60;
return sprintf( "%02d:%02d:%02d", $hours, $min, $sec );
}
sub parse_limits {
my $limits = shift or return;
if ( !@$limits ) {
return $limits;
}
my @parsed;
for my $lim (@$limits) {
push @parsed, [ split( /\s+/, $lim ) ];
}
return \@parsed;
}
sub progress_bar {
require Term::ProgressBar;
my $total = shift;
my $tpb = Term::ProgressBar->new(
{ ETA => 'linear',
name => 'swish3',
count => $total,
}
);
return $tpb;
}
__END__
=pod
=head1 NAME
swish3 - SWISH::Prog-based Swish3 implementation
=head1 SYNOPSIS
swish3 [-E N] [-i dir file ... ] [--doc_filter filter] [-S aggregator] [-c file] [-f invindex] [-l] \
[-v (num)]
swish3 -w word1 word2 ... [-f file1 file2 ...] \
[-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \
[-m num] [-t str] [-H (num)] [-x output_format] \
[-R rank_scheme] [-L prop low high]
swish3 -k (char|*) [-f invindex1 invindex2 ...]
swish3 -M invindex1 invindex2 ... outputfile
swish3 -N path/to/compare/file or date
swish3 -V
swish3 -D n
NOTE THAT SOME OPTIONS ARE NOT YET SUPPORTED.
Type:
% swish3 -h
=head1 DESCRIPTION
B<swish3> is one example of implementing the Swish3 API. B<swish3>
is written in Perl using the SWISH::Prog framework.
Type:
% swish3 -h
for a complete list of options.
=head1 AUTHOR
Peter Karman, E<lt>perl@peknet.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright 2009-2010 by Peter Karman
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 SEE ALSO
http://swish-e.org/swish3/
=cut