The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# -*- perl -*-

#
# $Id: Htdig.pm,v 1.18 2006/12/01 10:12:56 cmuellermeta Exp $
# Author: Slaven Rezic
#
# Copyright (C) 2002 Slaven Rezic. All rights reserved.
# This package is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Mail: slaven@rezic.de
# WWW:  http://www.rezic.de/eserte/
#

package WE_Frontend::Indexer::Htdig;

use strict;
use vars qw($VERSION);
$VERSION = sprintf("%d.%02d", q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/);

## XXX Maybe some day ...
# sub new {
#     my $class = shift;
#     my(%args) = @_;
#     my $self = {};
#     if ($args{-searchengine}) {
# 	WEsiteinfo::SearchEngine
#     }
# }

sub conf {
    my($conf, $lang) = @_;
    $conf =~ s/%\{lang\}/$lang/g;
    $conf;
}

sub conf_is_lang_dependent {
    my($conf) = @_;
    $conf =~ /%\{lang\}/ ? 1 : 0;
}

sub search {
    my(%args) = @_;
    my $words = delete $args{-words} || die "No words specified";
    my $conf  = delete $args{-conf};
    my $lang  = delete $args{-lang};
    my $query = delete $args{-query};
    my $debug = delete $args{-debug};
    my $https_hack = delete $args{-httpshack};
    my $transform  = delete $args{-transform};
    my $method = delete $args{-method};

    if (keys %args) { warn "Unknown arguments: " . join(", ", %args) } # XXXdie?

    local %ENV = %ENV;
    delete $ENV{REQUEST_METHOD}; # security barrier in htsearch
    # Location of htdig in the standard FreeBSD port, after a normal
    # unaltered install and a Debian 3.0 install. Sometimes $ENV{PATH} is
    # empty, so supply additional reasonable defaults.
    local $ENV{PATH} .= ":/usr/local/share/apache/cgi-bin:/opt/www/cgi-bin:/usr/lib/cgi-bin:/usr/bin:/bin";

    if ($debug) {
	warn "Current path is $ENV{PATH}";
    }

    require CGI;
    my $q = CGI::new($query);
    $q->param("words", $words);
    $q->param("format", "perl");
    $q->param("method", $method) if defined $method;

    my $conf_path = conf($conf, $lang) if defined $conf;
    my @htsearch_cmd = ("htsearch",
			(defined $conf ? ("-c", $conf_path) : ()),
			$q->query_string);
    open(HTSEARCH, "-|") or do {
	if ($debug) {
	    warn "Execute: @htsearch_cmd";
	}
	exec @htsearch_cmd;
	die "Can't execute htsearch command (@htsearch_cmd), PATH is $ENV{PATH}: $!";
    };

    # overread header
    while(<HTSEARCH>) {
	chomp;
	last if /^\r?$/;
    }

    # slurp generated perl data dump
    local $/ = undef;
    my $perlcode = <HTSEARCH>;
    #warn $perlcode;

    require Safe;
    my $cpt = Safe->new;
    my $obj = $cpt->reval($perlcode);

    my $nr = 0;
    die "Error while evaluating perl code result from htsearch call:\n@htsearch_cmd.\n" .
	"Code: " . join("\n", map { sprintf "%4d %s", ++$nr, $_ }
			split /\n/, $perlcode) . "\n" .
	"Error: $@\n" .
	(defined $conf ? "Configuration file is $conf_path\n" : "Using standard configuration file\n")
	    if !$obj;

    while(my($key,$val) = each %$obj) {
	if ($key =~ /(.*)_urlenc$/) {
	    my $real_key = $1;
	    $obj->{$real_key} = CGI::unescape($val);
	} elsif ($key eq 'list') {
	    for my $obj (@$val) {
		while(my($key,$val) = each %$obj) {
		    if ($key =~ /(.*)_urlenc$/) {
			$obj->{$1} = CGI::unescape($val);
		    }
		}
	    }
	}
    }

    my $parse_href = sub {
	my $href = shift;
	my($pageurl, $pagenumber);
	if (my($url, $querystring) = $href =~ /^(.*?)\?(.*)$/) {
	    my $q = CGI->new($querystring);
	    if (defined $lang && !defined $q->param("lang")) {
		$q->param("lang", $lang);
	    }
	    if (!defined $q->param("page")) {
		warn "Can't get page parameter from $href";
		$pagenumber = undef;
	    } else {
		$pagenumber = $q->param("page");
	    }
	    my $new_href = "$url?" . $q->query_string;
	    $pageurl = $new_href;
	} else {
	    warn "Can't parse URL $href";
	}
	($pageurl, $pagenumber);
    };

    if ($obj->{"pagelist"}) {
	my @pageurllist;
	my @pagenumberlist;
	my @hrefs = $obj->{"pagelist"} =~ m{href="(.*?)"}g;
	for my $href (@hrefs) {
	    my($pageurl, $pagenumber) = $parse_href->($href);
	    if (defined $pageurl) {
		push @pageurllist, $pageurl;
		push @pagenumberlist, $pagenumber;
	    }
	}
	# Add this page
	if ($pagenumberlist[0] != 1) {
	    unshift @pageurllist, undef; # undef means: this page
	    unshift @pagenumberlist, 1;
	} else {
	SEARCH: {
		for my $i (0 .. $#pagenumberlist) {
		    if ($i+1 != $pagenumberlist[$i]) {
			# current page is in the middle of the list
			splice @pagenumberlist, $i, 0, $i+1;
			splice @pageurllist, $i, 0, undef;
			last SEARCH;
		    }
		}
		# otherwise it's the last page
		push @pageurllist, undef;
		push @pagenumberlist, $#pagenumberlist+2;
	    }
	}
	$obj->{pageurllist}    = \@pageurllist;
	$obj->{pagenumberlist} = \@pagenumberlist;
    }

    for my $dir (qw(prev next)) {
	if ($obj->{$dir."page"}) {
	    my($href) = $obj->{$dir."page"} =~ m{href="(.*?)"};
	    my($pageurl, $pagenumber) = $parse_href->($href);
	    if (defined $pageurl) {
		$obj->{$dir."pageurl"} = $pageurl;
		$obj->{$dir."pagenumber"} = $pagenumber;
	    }
	}
    }

    if ($https_hack && $obj->{list}) {
	for my $hit (@{ $obj->{list} }) {
	    $hit->{url} =~ s{^http://}{https://};
	}
    }

    # words is documented, but not available?
    if (!exists $obj->{words}) {
	$obj->{words} = $words;
    }

    if ($transform) {
	$transform->($obj);
    }

    $obj;
}

sub generate_conf {
    my($c, %args) = @_;

    my $debug = $args{-debug};

    my $lang = $args{-lang};

    my $tpl  = $args{-htdigconftemplate} || $c->searchengine->htdigconftemplate;
    my $conf = $args{-htdigconf} || $c->searchengine->htdigconf;
    my $lang_conf = conf($conf, $lang);
    if (conf_is_lang_dependent($lang_conf)) {
	die "-lang should be supplied for language independent conf specification $conf";
    }

    if (!defined $tpl) {
	if ($debug) {
	    warn "No template config defined, we're done with $lang_conf.\n";
	}
	return $lang_conf;
    }

    my @dependents;
    if ($args{-dependents}) {
	@dependents = @{ $args{-dependents} };
    } else {
	(my $pkgfile = __PACKAGE__) =~ s{::}{/}g;
	# XXX what if module is named WEsiteinfo_project.pm and WEprojectinfo_project.pm?
	push @dependents,
	    $INC{"WEsiteinfo.pm"}, $INC{"WEprojectinfo.pm"},
	    $tpl,
	    $INC{"$pkgfile.pm"};
    }

    # is the configuration file current?
    my $conf_is_old = 0;
    if (!-e $lang_conf) {
	$conf_is_old = 1;
    } else {
	for my $dep (grep { defined $_ } @dependents) {
	    if (!-e $lang_conf || -M $dep < -M $lang_conf) {
		$conf_is_old = 1;
		last;
	    }
	}
    }

    if (!$conf_is_old) {
	warn "htdig config file $lang_conf is current, we're done.\n";
	return $lang_conf;
    }

    my $long_lang;
    if (defined $lang) {
	$long_lang = {en => "english",
		      de => "german",
		      it => "italian",
		      fr => "french",
		      kr => "korean",
		      ru => "russian",
		      es => "spanish",
		      pt => "portugese",
		      hu => "hungarian",
		     }->{$lang};
	warn "long_lang is not defined for $lang"
	    if !defined $long_lang;
    }

    # regenerate conf file
    require Template;
    # XXX Don't duplicate this --- already found in we_search.cgi and
    # we_redisys.cgi
    my $t = Template->new
	(ABSOLUTE => 1,
	 POST_CHOMP => 0,
	 INCLUDE_PATH => [$c->paths->site_templatebase,
			  $c->paths->we_templatebase,
			 ],
	 EVAL_PERL => 1,
	 PLUGIN_BASE => ["WE_" . $c->project->name . "::Plugin",
			 "WE_Frontend::Plugin"]
	);
    if ($debug) {
	warn "Create config file $lang_conf from $tpl.\n";
    }
    my $conf_header = <<EOF;
# DO NOT EDIT THIS FILE!
# Generated automatically by:
#   module: @{[ __PACKAGE__ ]}
#   user:   @{[ (getpwuid($<))[0] ]}
#   date:   @{[ scalar localtime ]}
EOF
    $t->process
	($tpl,
	 {c        => $c,
	  config   => $c, # for compatibility
	  lang     => $lang,
	  longlang => $long_lang,
	  # strip dash from args keys
	  args     => [map { (substr($_, 1) => $args{$_}) } keys(%args)],
	  conf_header => $conf_header,
	 }, $lang_conf)
	or die $t->error;

    return $lang_conf;
}

1;

__END__

=head1 NAME

WE_Frontend::Indexer::Htdig - interface to the htdig search engine

=head1 SYNOPSIS

    use WE_Frontend::Indexer::Htdig;
    my $results = WE_Frontend::Indexer::Htdig::search(-words => "word");

=head1 DESCRIPTION

This is an interface to the C<htdig> search engine. The result of the
C<search> function call is a perl hash reference containing the
results.

=head1 FUNCTIONS

=head2 search(%args)

Arguments are:

=over

=item -words

A string with the words to search. Multiple words are
space-separated. This argument is required.

=item -conf

Specify a different htdig configuration file, otherwise the default
C<htdig.conf> is used.

=item -lang

(Optional) Specify a language. The configuration parameter given by
conf may contain %{lang} placeholders which are substituted by the
value of this argument.

=item -debug

Output some diagnostics to stderr.

=item -httpshack

Set to a true value if operating on a https server. htdig does not
handle SSL, so a parallel http should be setup for the indexing. With
the https hack the URLs in the search result C<list> are translated at
template display time.

=back

The result is a hash reference with the following keys:

=over

=item logical_words

=item matches_per_page

=item max_stars

=item page

=item pages

=item list

Holds an array with the search results. See below.

=item nomatch

This variable is set to a true value if the search produces no
results. Also detectable by an empty result list.

=item pageurllist

A list of URLs for the 1 .. 10 result pages.

=item pagenumberlist

The corresponding numbers for the pageurllist. Please note that
perl/Template arrays start with index 0 (which would be page 1).

=item prevpageurl

=item nextpageurl

Hold the URLs for the previous resp. next result page.

=item prevpagenumber

=item nextpagenumber

Usually not needed: the number of the previous resp. next result page.
In fact you would label them "Prev"/"Next" or "E<lt>"/"E<gt>".

=item ...

=back

There are more keys. For a complete list refer to the htdig
documentation at L<http://www.htdig.org>, C<htsearch>, Templates. Note
that the original template variable names are converted to lowercase.

The value of C<list> is an array reference with the matches. Each
match is a hash reference with the following keys:

=over

=item url

The URL of the page. See also the C<-httpshack> option above.

=item title

The title of the page, as specified by the <title> html tag.

=item anchor

=item excerpt

The first lines of text in the document.

=item score

=item percent

=item modified

The date and time the document was last modified. See also the
documentation of the C<iso_8601> config variable in C<htdig.conf>.

=item ...

=back

The complete list is also in the htdig documentation at
L<http://www.htdig.org>, C<htsearch>, Templates.

=head1 CONFIGURATION FILES

It is best to just use the original C<conf/htdig.tpl.conf> file found
in the B<webeditor> distribution. The indexing program in B<webeditor>
will use the template file and fill it with the configuration found in
C<WEsiteinfo>. Please look also into htdig.txt in the webeditor/doc
directory for a first-time installation/configuration.

=head2 WEsiteinfo configuration:

To override the searchindexer path (default is "rundig" without a
path):

    $searchengine->searchindexer("/usr/local/bin/rundig");

To set the template htdig and target htdig configuration files (these
settings are highly recommended):

    $searchengine->htdigconftemplate($paths->uprootdir . "/conf/htdig.tpl.conf");
    $searchengine->htdigconf($paths->uprootdir . "/conf/htdig.%{lang}.conf");

where C<$paths> is the B<WEsiteinfo::Paths> object documented in
L<WE_Frontend::Info>. If the configuration file should not be language
dependent, then use

    $searchengine->htdigconf($paths->uprootdir . "/conf/htdig.conf");

instead.

=head2 Own htdig.conf

If you decide to make your own C<htdig.conf>, put at least the
following lines into the configuration file:

    template_map: Long long ${common_dir}/long.html \
                  Short short ${common_dir}/short.html \
                  Perl perl ${common_dir}/perl/match.pl
    template_name: perl
    search_results_header: ${common_dir}/perl/header.pl
    search_results_footer: ${common_dir}/perl/footer.pl
    nothing_found_file:    ${common_dir}/perl/nomatch.pl

C<${common_dir}/perl> should be a link to the directory
C<.../lib/WE_Frontend/Indexer/htdig_common>.

=head1 INSTALLING HTDIG

htdig is available e.g. from this location:
L<http://www.htdig.org/files/snapshots/htdig-3.2.0b5-20040404.tar.gz>.

To compile and install htdig from scratch, the following configure
line could be used to create a path layout similar to the RedHat one:

    sh configure --prefix=/usr --with-search-dir=/usr/share/htdig --with-image-dir=/usr/share/htdig --with-cgi-bin-dir=/usr/bin --with-config-dir=/etc --with-database-dir=/usr/share/htdig

=head1 CAVEATS

Many. Mind the permissions. Especially, rundig may use the default
database directory (C</usr/local/share/htdig/database> or such) as the
temporary directory for sorting, which will fail if the apache user
(usually C<nobody> or C<www>) has no permissions to write to this
directory. In this case change the C<TMPDIR> definition in rundir or
set appropriate write permissions.

=head1 AUTHOR

Slaven Rezic - slaven@rezic.de

=head1 SEE ALSO

L<htdig(1)>.

=cut