lib/Search/Lemur.pm - metacpan.org

package Search::Lemur;

use warnings;
use strict;
use Carp qw( carp ); 

use Search::Lemur::Result;
use Search::Lemur::ResultItem;
use Search::Lemur::Database;

use LWP;
use Data::Dumper;

use vars qw( $VERSION );

=head1 NAME

Lemur - class to query a Lemur server, and parse the results

=head1 VERSION

Version 1.00

=cut

our $VERSION = '1.00';

=head1 SYNOPSYS

 use Search::Lemur;

 my $lem = Search::Lemur->new("http://url/to/lemur.cgi");

 # run some queries, and get back an array of results
 # a query with a single term:
 my @results1 = $lem->query("encryption");
 # a query with two terms:
 my @results2 = $lem->query("encryption MD5");

 # get corpus term frequency of 'MD5':
 my $md5ctf = $results2[1]->ctf();

=head1 DESCRIPTION

This module will make it easy to interact with a Lemur
Toolkit for Language Modeling and Information Retrieval 
server for information retreival exercises.  For more
information on Lemur, see L<http://www.lemurproject.org/>.  

This module takes care of all parsing of responses from
the server. You can just pass a query as a 
space-separated list of terms, and the module will give
you back an array of C<result> objects.

=cut


=head2 Main Methods

=over 2

=item new($url)

Create a new Lemur object, connecting to the given Lemur server.
The C<$url> should be a full URL, ending in something like 'lemur.cgi'.

=cut

sub new {
    my $class = shift;
    my $url;
    if (@_) { $url = shift; 
    } else { return undef; }
    my $self = { baseurl => $url,
                 db => 0,
                 n => undef,
                 fullurl => undef };
    bless $self, $class;
    $self->{fullurl} = $self->_makeurl();
    return $self;
}

=item url()

Return the URL of the Lemur server

=cut

sub url {
    my $self = shift;
    return $self->{baseurl};
}

=item listdb()

Get some information about the databases available

Returns an array of Lemur::Database objects.

=cut

sub listdb {
    my $self = shift;
    $self->_makeurl();
    my $url = $self->{fullurl} . "&d=?";
    my $result = $self->_strip($url);
    return $self->_makedbs($result);
}

=item d([num])

Set the database number to query.  This will specify the 
database number instead of just using the default databse 0.

If the C<num> is not specified, the the current database is returned.

=cut

sub d {
    my $self = shift;
    if (@_) { $self->{d} = shift; $self->_makeurl(); }
    return $self->{d};
}

    

=item v(string)

Make a query to the Lemur server.  The query should be a space-delimited 
list of query terms.  If the URL is has not been specified, this will die.

Be sure there is only one space between words, or something unexpected may 
happen.

Returns an array of results (See L<Lemur::result>).  There will
be a result for each query term.

=cut

# This method really just queries the server, and passes the response on to 
# &_parse(string). This was done to make testing easier, without having to 
# query a real server for testing.
sub v {
    my $self = shift;
    my $query = shift;
    $query =~ s/ +/ /g;
    croak("Something went wrong; I have no URL") unless $self->{baseurl};
    my @terms = split(/ +/, $query);
    my $url = $self->{fullurl};
    foreach my $term (@terms) {
        $url = "$url&v=$term";
    }
    return $self->_parse([$query, $self->_strip($url)]);
}

=item m(string)

Returns the lexicalized (stopped & stemmed) version of the given
word.  This is affected by weather or not the current database
is stemmed and/or stopworded.  Basically, this is the real word 
you will end up searching for.

Returns a string.

=cut

sub m {
    my $self = shift;
    my $word = shift;
    my $url = $self->{fullurl} . "&m=$word";
    my $return = $self->_strip($url);
    if ($return eq "[OOV]") { $return = ""; }
    return $return;
}

# parse information about available databases into an array of 
# Search::Lemur::Database objects
#
# string -> arrayref
sub _makedbs {
    my $self = shift;
    my $input = shift;
    my @input = split(/\n/, $input);
    my @return;
    my ($num, $title, $stop, $stem, $numdocs, 
        $numterms, $numuniq, $avgdoclen);
    while (scalar(@input) >= 1){
        my $line = shift(@input);
        if ($line =~ m/(\d*):  ([\w|\d|\s]*) (NOSTOP|STOP) (NOSTEMM|STEMM);/){
            $num = $1;
            $title = $2;
            $stop = ($3 eq "STOP") ? 1 : 0;
            $stem = ($4 eq "STEMM") ? 1 : 0;
        } elsif ($line =~ m/ NUM_DOCS = ?(\d*);/){
            $numdocs = $1;
        } elsif ($line =~ m/ NUM_UNIQUE_TERMS = ?(\d*);/){
            $numuniq = $1;
        } elsif ($line =~ m/ NUM_TERMS = ?(\d*);/){
            $numterms = $1;
        } elsif ($line =~ m/ AVE_DOCLEN = ?(\d*);/){
            $avgdoclen = $1;
        } elsif ($line =~ m/<BR>/){
            my $db = Search::Lemur::Database->_new($num, $title, $stop, 
                $stem, $numdocs, $numterms, $numuniq, $avgdoclen);
            push @return, $db;
        }
    }
    return \@return;
}

# parse the result from the server
#
# Takes a reference to an array with two items:
#   - a string containing the query terms, separated by spaces
#   - a string containing the response
# 
# returns array of results
sub _parse {
    my $self = shift;
    my $inputref = shift;
    my @input = @$inputref;
    my @terms = split(/ /, $input[0]);
#    print Dumper($input[1]);
    my @response = split(/\D+/, $input[1]);
    shift(@response) if ($response[0] eq ""); #TODO Why am I doing this? this makes tests fail.
    my $numterms = scalar(@terms);

    my @return;
    
    # build a result object for each term
    foreach my $term (@terms) {
#        print Dumper(@response);
        my $ctf = shift(@response);
        my $df = shift(@response);
        my $result = Search::Lemur::Result->_new($term, $ctf, $df);
        # build a resultItem object for each document
        for (my $i = 0; $i < $df; $i++){
            my $docid = shift(@response);
            my $doclen = shift(@response);
            my $tf = shift(@response);
            my $resultItem = Search::Lemur::ResultItem->_new($docid, $doclen, $tf);
            $result->_add($resultItem);
        }
        push(@return, $result);
    }

    return \@return;
}

# build the full url to use for all queries
# This url consists of the base url (ending in lemur.cgi) plus
# d=n (specifies the database) and n=x (the number of results
# to return.  If either of these are undef, then they are left
# off, and the server is free to use its defaults
#
# the n value seems to only affect the q= query, and not the
# inverted list v= query.
#
# returns a string, and updates the fullurl instance variable 
sub _makeurl {
    my $self = shift;
    my $return = $self->url() . "?g=p";
    if ($self->{d}) { $return = $return . "&d=$self->{d}"; }
    if ($self->{n}) { $return = $return . "&n=$self->{n}"; }
    $self->{fullurl} = $return;
    return $return;
}
    
# strip_: make a request to the server, and strip out anything
# useless
#
# This will get the result from the server, and strip put any 
# html, etc that is not useful to the parser.
#
# string -> string
#
# takes in a url argument to fetch, and returns the stripped 
# result.
sub _strip {
    my $self = shift;
    my $url = shift;
#    print "$url\n\n";
    my $ua = LWP::UserAgent->new;
    $ua->agent("Lemur.pm/$VERSION");
    my $req = HTTP::Request->new(GET => $url);
    $req->content_type('application/x-www-form-urlencoded');
    $req->content('query=libwww-perl&mode=dist');
    # make request
    my $res = $ua->request($req);

    if ($res->is_success) {
        $res->content() =~ m/.*<BODY>\n\n((\s|\d|\n|\w|\[|\]|:|;|=|<|>)*?)\n<HR>/;
#        print $1 . "\n\n";
        return $1;
    }
    else {
        Carp::carp($res->status_line, "\n");
        return undef;
    }
}




=back

=head1 AUTHOR

Patrick Kaeding, C<< <pkaeding at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-search-lemur at rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Search-Lemur>.
I will be notified, and then you'll automatically be notified of progress on
your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Search::Lemur

You can also look for information at:

=over 4

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Search-Lemur>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Search-Lemur>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Search-Lemur>

=item * Search CPAN

L<http://search.cpan.org/dist/Search-Lemur>

=back

=head1 ACKNOWLEDGEMENTS

=head1 COPYRIGHT & LICENSE

Copyright 2007 Patrick Kaeding, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of Search::Lemur
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)