The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
use strict;
use warnings;
use 5.008_003;

package SWISH::3;

our $VERSION = '1.000014';
my $version = $VERSION;
$VERSION = eval $VERSION;    # numerify

# set by libswish3 in swish.c but that happens after %ENV has been
# initialized at Perl compile time.
$ENV{SWISH3} = 1;

use Carp;
use Data::Dump;

use base qw( Exporter );

use constant SWISH_DOC_FIELDS =>
    qw( mtime size encoding mime uri nwords ext parser );
use constant SWISH_TOKEN_FIELDS => qw( pos meta value context len );

# these numbers are assigned via enum in libswish3.h
# and so are too tedious to parse via Makefile.PL
# since they are typically only added-to, not a big deal
# to maintain manually here.
# we can't just assign to the constant value since the
# constants are not loaded until run time above via XSLoader.
use constant SWISH_DOC_FIELDS_MAP => {
    encoding    => 10,
    mime        => 8,
    description => 6,
    mtime       => 5,
    nwords      => 7,
    parser      => 9,
    size        => 4,
    title       => 3,
    uri         => 1,
};

# property name to docinfo attribute
use constant SWISH_DOC_PROP_MAP => {
    swishencoding     => 'encoding',
    swishmime         => 'mime',
    swishlastmodified => 'mtime',
    swishwordnum      => 'nwords',
    swishparser       => 'parser',
    swishdocsize      => 'size',
    swishdocpath      => 'uri'
};

# load the XS at runtime, since we need $version
require XSLoader;
XSLoader::load( __PACKAGE__, $version );

# init the memory counter as class method at start up
# and call debug in END block
SWISH::3->_setup;

END {

# NOTE This will give false positives if there is a Perl reference
# count attached to a SWISH::3 object when the program ends,
# as when in a closure or function.
# so only report when env var is on.
# this is a bug in Perl 5.8.x:
# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2007-12/msg01047.html

    if ( $ENV{SWISH_DEBUG_MEMORY} && SWISH::3->get_memcount ) {
        warn " ***WARNING*** possible memory leak ***WARNING***\n";
        SWISH::3->mem_debug();
    }
}

# our symbol table is populated with newCONSTSUB in Constants.xs
# directly from libswish3.h, so we just grep the symbol table.
my @constants = ( grep {m/^SWISH_/} keys %SWISH::3:: );

our %EXPORT_TAGS = ( 'constants' => [@constants], );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'constants'} } );

# convenience accessors
*config   = \&get_config;
*analyzer = \&get_analyzer;
*regex    = \&get_regex;
*parser   = \&get_parser;

# alias debugging methods for all classes
*SWISH::3::Config::refcount   = \&refcount;
*SWISH::3::Analyzer::refcount = \&refcount;
*SWISH::3::WordList::refcount = \&refcount;
*SWISH::3::Word::refcount     = \&refcount;
*SWISH::3::Doc::refcount      = \&refcount;
*SWISH::3::Data::refcount     = \&refcount;
*SWISH::3::Property::refcount = \&refcount;
*SWISH::3::MetaName::refcount = \&refcount;

# another mnemonic alias
*SWISH::3::Config::merge = \&SWISH::3::Config::add;

sub new {
    my $class = shift;
    my %arg   = @_;
    my $self  = $class->_init;

    if ( $arg{config} ) {
        $self->get_config->add( $arg{config} );
    }

    # override defaults
    for my $param (qw( data_class parser_class config_class analyzer_class ))
    {
        my $method = 'set_' . $param;

        if ( exists $arg{$param} ) {

            #warn "$method";
            $self->$method( $arg{$param} );
        }
    }

    $arg{handler} ||= \&default_handler;

    $self->set_handler( $arg{handler} );

    # Lucy default regex -- should also match swish_tokenize() behaviour
    $arg{regex} ||= qr/\w+(?:[\x{2019}']\w+)*/;
    $self->set_regex( $arg{regex} );

    return $self;
}

sub parse {
    my $self = shift;
    my $what = shift
        or croak "parse requires filehandle, scalar ref or file path";
    if ( ref $what eq 'SCALAR' ) {
        return $self->parse_buffer($what);
    }
    elsif ( ref $what ) {
        return $self->parse_fh($what);
    }
    else {
        return $self->parse_file($what);
    }
}

sub dump {
    my $self = shift;
    if (@_) {
        $self->describe(@_);
        Data::Dump::dump(@_);
    }
    else {
        $self->describe($self);
        Data::Dump::dump($self);
    }
}

sub default_handler {
    my $data = shift;
    unless ( $ENV{SWISH_DEBUG} ) {
        warn "default handler called\n";
        return;
    }

    select(STDERR);
    print '~' x 80, "\n";

    my $props     = $data->properties;
    my $prop_hash = $data->config->get_properties;

    print "Properties\n";
    for my $p ( sort keys %$props ) {
        print " key: $p\n";
        my $prop_value = $props->{$p};
        print " value: " . Data::Dump::dump($prop_value) . "\n";
        my $prop = $prop_hash->get($p);
        printf( "    <%s type='%s'>%s</%s>\n",
            $prop->name, $prop->type, $data->property($p), $prop->name );
    }

    print "Doc\n";
    for my $d (SWISH_DOC_FIELDS) {
        printf( "%15s: %s\n", $d, $data->doc->$d );
    }

    print "TokenList\n";
    my $tokens = $data->tokens;
    while ( my $token = $tokens->next ) {
        print '-' x 50, "\n";
        for my $field (SWISH_TOKEN_FIELDS) {
            printf( "%15s: %s\n", $field, $token->$field );
        }
    }
}

{
    package    # hide from CPAN
        SWISH::3::xml2TiedHash;

    use overload (
        '%{}'    => sub { shift->_as_hash() },
        bool     => sub {1},
        fallback => 1,
    );

    sub _as_hash {
        my $self = shift;
        my %tmp  = ();
        for my $k ( @{ $self->keys } ) {
            my $v = $self->get($k);
            if ( ref $v ) {
                $tmp{$k} = $v->_as_hash();
            }
            else {
                $tmp{$k} = $v;
            }
        }
        return \%tmp;
    }

    sub TO_JSON {
        return shift->_as_hash;
    }

    sub FREEZE {
        return shift->_as_hash;
    }

}
{
    package    # hide
        SWISH::3::xml2Hash;
    our @ISA = ('SWISH::3::xml2TiedHash');
}
{
    package    # hide
        SWISH::3::MetaNameHash;
    our @ISA = ('SWISH::3::xml2TiedHash');
}
{
    package    # hide
        SWISH::3::PropertyHash;
    our @ISA = ('SWISH::3::xml2TiedHash');
}
{
    package    # hide
        SWISH::3::MetaName;
    our @ISA = ('SWISH::3::xml2TiedHash');

    sub _as_hash {
        my $self = shift;
        return { map { $_ => $self->$_ } qw( id name bias alias_for ) };
    }
}
{
    package    # hide
        SWISH::3::Property;
    our @ISA = ('SWISH::3::xml2TiedHash');

    sub _as_hash {
        my $self = shift;
        return { map { $_ => $self->$_ }
                qw( id name ignore_case type verbatim max sort presort alias_for )
        };
    }
}

1;

__END__

=head1 NAME

SWISH::3 - Perl interface to libswish3

=head1 SYNOPSIS

 use SWISH::3 qw(:constants);
 my $handler = sub {
    my $s3_data   = shift;
    my $props     = $s3_data->properties;
    my $prop_hash = $s3_data->config->get_properties;

    print "Properties\n";
    for my $p ( sort keys %$props ) {
        print " key: $p\n";
        my $prop = $prop_hash->get($p);
        printf( "    <%s type='%s'>%s</%s>\n",
            $prop->name, $prop->type, $s3_data->property($p), $prop->name );
    }    

    print "Doc\n";
    for my $d (SWISH_DOC_FIELDS) {
        printf( "%15s: %s\n", $d, $s3_data->doc->$d );
    }    

    print "TokenList\n";
    my $tokens = $s3_data->tokens;
    while ( my $token = $tokens->next ) {
        print '-' x 50, "\n";
        for my $field (SWISH_TOKEN_FIELDS) {
            printf( "%15s: %s\n", $field, $token->$field );
        }    
    }
 };
 my $swish3 = SWISH::3->new(
                config      => 'path/to/config.xml',
                handler     => $handler,
                regex       => qr/\w+(?:'\w+)*/,
                );
 $swish3->parse( 'path/to/file.xml' )
    or die "failed to parse file: " . $swish3->error;
 
 printf "libxml2 version %s\n", $swish3->xml2_version;
 printf "libswish3 version %s\n", $swish3->version;
 
 
=head1 DESCRIPTION

SWISH::3 is a Perl interface to the libswish3 C library.

=head1 CONSTANTS

All the C<SWISH_*> constants defined in libswish3.h are available
and can be optionally imported with the B<:constants> keyword.

 use SWISH::3 qw(:constants);

See the SWISH::3::Constants section below.

In addition, the SWISH::3 Perl class defines some Perl-only constants:

=over

=item SWISH_DOC_FIELDS

An array of method names that can be called on a SWISH::3::Doc object
in your handler method.

=item SWISH_TOKEN_FIELDS

An array of method names that can be called on a SWISH::3::Token object.

=item SWISH_DOC_FIELDS_MAP

A hashref of method names to id integer values. The integer values
are assigned in libswish3.h.

=item SWISH_DOC_PROP_MAP

A hashref of built-in property names to docinfo attribute names.
The values of SWISH_DOC_PROP_MAP are the keys of SWISH_DOC_FIELDS_MAP.

=back

=head1 FUNCTIONS

=head2 default_handler

The handler used if you do not specify one. By default is simply
prints the contents of SWISH::3::Data to stderr.

=head1 CLASS METHODS

=head2 new( I<args> )

I<args> should be an array of key/value pairs. See SYNOPSIS.

Returns a new SWISH::3 instance.

=head2 xml2_version

Returns the libxml2 version used by libswish3.

=head2 version

Returns the libswish3 version.

=head2 refcount( I<object> )

Returns the Perl reference count for I<object>.

=head2 wc_report( I<codepoint> )

Prints a isw* summary to stderr for I<codepoint>. I<codepoint>
should be a positive integer representing a Unicode codepoint.

This prints a report similar to the swish_isw.c example script.

=head2 slurp( I<filename> )

Returns the contents of I<filename> as a scalar string. May also
be called as an object method.

=head1 OBJECT METHODS

=head2 get_file_ext( I<filename> )

Returns file extension for I<filename>.

=head2 get_mime( I<filename> )

Returns the configured MIME type for I<filename> based on file
extension.

=head2 get_real_mime( I<filename> )

Returns the configured MIME type for I<filename>, ignoring
any C<.gz> extension. See L<looks_like_gz>.

=head2 looks_like_gz( I<filename> )

Returns true if I<filename> has a file extension indicating
it is gzip'd. Wraps the swish_fs_looks_like_gz() C function.

=head2 parse( I<filename_or_filehandle_or_string> )

Wrapper around parse_file(), parse_buffer() and parse_fh() that tries
to Do the Right Thing.

=head2 parse_file( I<filename> )

Calls the C function of the same name on I<filename>.

=head2 parse_buffer( I<str> )

Calls the C function of the same name on I<str>. B<Note> that
I<str> should contain the API headers.

=head2 parse_fh( I<filehandle> )

Calls the C function of the same name on I<filehandle>. B<Note> that
the stream pointed to by I<filehandle> should contain the API headers.
See L<SWISH::3::Headers>.

=head2 error

Returns the error message from the last call to parse(), parse_file()
parse_buffer() or parse_fh(). If there was no error on the last
call to one of those methods, returns undef.

=head2 set_config( I<swish_3_config> )

Set the Config object.

=head2 get_config

Returns SWISH::3::Config object.

=head2 config

Alias for get_config().

=head2 set_analyzer( I<swish_3_analyzer> )

Set the Analyzer object.

=head2 get_analyzer

Returns SWISH::3::Analyzer object.

=head2 analyzer

Alias for get_analyzer()

=head2 set_parser( I<swish_3_parser> )

Set the Parser object.

=head2 get_parser

Returns SWISH::3::Parser object.

=head2 parser

Alias for get_parser().

=head2 set_handler( \&handler )

Set the parser handler CODE ref.

=head2 get_handler

Returns a CODE ref for the handler.

=head2 set_data_class( I<class_name> )

Default I<class_name> is C<SWISH::3::Data>.

=head2 get_data_class

Returns class name.

=head2 set_parser_class( I<class_name> )

Default I<class_name> is C<SWISH::3::Parser>.

=head2 get_parser_class

Returns class name.

=head2 set_config_class( I<class_name> )

Default I<class_name> is C<SWISH::3::Config>.

=head2 get_config_class

Returns class name.

=head2 set_analyzer_class( I<class_name> )

Default I<class_name> is C<SWISH::3::Analyzer>.

=head2 get_analyzer_class

Returns class name.

=head2 set_regex( qr/\w+(?:'\w+)*/ )

Set the regex used in tokenize().

=head2 get_regex

Returns the regex used in tokenize().

=head2 regex

Alias for get_regex().

=head2 get_stash

Returns the SWISH::3::Stash object used internally by 
the SWISH::3 object. You typically do not need to access this object
as a user of SWISH::3, but if you are developing code that needs to
access objects within a I<handler> function, you can put it in the Stash
object and then retrieve it later.

Example:

 my $s3    = SWISH::3->new( handler => \&handler );
 my $stash = $s3->get_stash();
 $stash->set('my_indexer' => $indexer);
 
 # later..
 sub handler {
     my $data  = shift;
     my $indexer = $data->s3->get_stash->get('my_indexer');
     $indexer->add_doc( $data );
 }

=head2 tokenize( I<string> [, I<metaname>, I<context> ] )

Returns a SWISH::3::TokenIterator object representing I<string>.
The tokenizer uses the regex defined in set_regex().

=head2 tokenize_native( I<string> [, I<metaname>, I<context> ] )

Returns a SWISH::3::TokenIterator object representing I<string>.
The tokenizer uses the built-in libswish3 tokenizer, not a regex.

=head1 DEVELOPER METHODS

=head2 ref_cnt

Returns the internal reference count for the underlying C struct pointer.

=head2 debug([I<n>])

Get/set the internal debugging level.

=head2 describe( I<object> )

Like calling Devel::Peek::Dump on I<object>.

=head2 mem_debug

Calls the C function swish_memcount_debug().

=head2 get_memcount

Returns the global C malloc counter value.

=head2 dump

A wrapper around describe() and Data::Dump::dump().

=head1 SWISH::3::Analyzer

=head2 new( I<swish_3_config> )

Returns a new SWISH::3::Analyzer instance.

=head2 set_regex( qr/\w+/ )

Set the regex used in SWISH::3->tokenize().

=head2 get_regex

Returns a qr// regex object.

=head2 get_tokenize

Get the tokenize flag. Default is true.

=head2 set_tokenize( 0|1 )

Toggle the tokenize flag. Default is true (tokenize contents when
file is parsed).

=head1 SWISH::3::Config

=head2 set_default

=head2 set_properties

Not yet implemented.

=head2 get_properties

Returns SWISH::3::PropertyHash object.

=head2 set_metanames

Not yet implemented.

=head2 get_metanames

Returns SWISH::3::MetaNameHash object.

=head2 set_mimes

Not yet implemented.

=head2 get_mimes

Returns SWISH::3::xml2Hash object.

=head2 set_parsers

Not yet implemented.

=head2 get_parsers

Returns SWISH::3::xml2Hash object.

=head2 set_aliases

Not yet implemented.

=head2 get_aliases

Returns SWISH::3::xml2Hash object.

=head2 set_index

Not yet implemented.

=head2 get_index

Returns SWISH::3::xml2Hash object.

=head2 set_misc

Not yet implemented.

=head2 get_misc

Returns SWISH::3::xml2Hash object.

=head2 debug

=head2 add(I<file_or_xml>)

An alias for add() is merge().

=head2 delete

delete() is B<NOT YET IMPLEMENTED>.

=head2 read( I<filename> )

Returns SWISH::3::Config object.

=head2 write( I<filename> )

=head1 SWISH::3::Data

=head2 s3

Get the parent SWISH::3 object.

=head2 config

Get the parent SWISH::3::Config object.

=head2 property( I<name> )

Returns the string value of PropertyName I<name>.

=head2 metaname( I<name> )

Returns the string value of MetaName I<name>.

=head2 properties

Returns a hashref of name/value pairs.

=head2 metanames

Returns a hashref of name/value pairs.

=head2 doc

Returns a SWISH::3::Doc object.

=head2 tokens

Returns a SWISH::3::TokenIterator object.

=head1 SWISH::3::Doc

=head2 mtime

Returns the last modified time as epoch int.

=head2 size

Returns the size in bytes.

=head2 nwords

Returns the number of tokenized words in the Doc.

=head2 encoding

Returns the string encoding of Doc.

=head2 uri

Returns the URI value.

=head2 ext

Returns the file extension.

=head2 mime

Returns the mime type.

=head2 parser

Returns the name of the parser used (TXT, HTML, or XML).

=head2 action

Returns the intended action (e.g., add, delete, update).

=head1 SWISH::3::MetaName

=head2 new( I<name> )

Returns a new SWISH::3::MetaName instance. 

TODO: there are no set methods so this isn't of much use.

=head2 id

Returrns the id integer.

=head2 name

Returns the name string.

=head2 bias

Returns the bias integer.

=head2 alias_for

Returns the alias_for string.

=head1 SWISH::3::MetaNameHash

=head2 get( I<name> )

Get the SWISH::3::MetaName object for I<name>

=head2 set( I<name>, I<swish_3_metaname> )

Set the SWISH::3::MetaName for I<name>.

=head2 keys

Returns array of names.

=head1 SWISH::3::Property

=head2 id

Returns the id integer.

=head2 name

Returns the name string.

=head2 ignore_case

Returns the ignore_case boolean.

=head2 type

Returns the type integer.

=head2 verbatim

Returns the verbatim boolean.

=head2 max

Returns the max integer.

=head2 sort

Returns the sort boolean.

=head2 alias_for

Returns the alias_for string.

=head1 SWISH::3::PropertyHash

=head2 get( I<name> )

Get the SWISH::3::Property object for I<name>

=head2 set( I<name>, I<swish_3_property> )

Set the SWISH::3::Property for I<name>.

=head2 keys

Returns array of names.

=head1 SWISH::3::Stash

=head2 get( I<key> )

=head2 set( I<key>, I<value> )

=head2 keys

=head2 values

=head1 SWISH::3::Token

=head2 value

Returns the value string.

=head2 meta

Returns the SWISH::3::MetaName object for the Token.

=head2 meta_id

Returns the id integer for the related MetaName.

=head2 context

Returns the context string.

=head2 pos

Returns the position integer.

=head2 len

Returns the length in bytes of the Token.

=head1 SWISH::3::TokenIterator

=head2 next

Returns the next SWISH::3::Token.

=head1 SWISH::3::xml2Hash

=head2 get( I<key> )

=head2 set( I<key>, I<value> )

=head2 keys

=head1 SWISH::3::Constants

The following constants are imported directly from libswish3
and are defined there.

=over

=item SWISH_ALIAS

=item SWISH_BODY_TAG

=item SWISH_BUFFER_CHUNK_SIZE

=item SWISH_CASCADE_META_CONTEXT

=item SWISH_CLASS_ATTRIBUTES

=item SWISH_CONTRACTIONS

=item SWISH_DATE_FORMAT_STRING

=item SWISH_DEFAULT_ENCODING

=item SWISH_DEFAULT_METANAME

=item SWISH_DEFAULT_MIME

=item SWISH_DEFAULT_PARSER

=item SWISH_DEFAULT_PARSER_TYPE

=item SWISH_DEFAULT_VALUE

=item SWISH_DOM_CHAR

=item SWISH_DOM_STR

=item SWISH_ENCODING_ERROR

=item SWISH_ESTRAIER_FORMAT

=item SWISH_EXT_SEP

=item SWISH_FALSE

=item SWISH_FOLLOW_XINCLUDE

=item SWISH_HEADER_FILE

=item SWISH_HEADER_ROOT

=item SWISH_IGNORE_XMLNS

=item SWISH_INCLUDE_FILE

=item SWISH_INDEX

=item SWISH_INDEX_FILEFORMAT

=item SWISH_INDEX_FILENAME

=item SWISH_INDEX_FORMAT

=item SWISH_INDEX_LOCALE

=item SWISH_INDEX_STEMMER_LANG

=item SWISH_INDEX_NAME

=item SWISH_KINOSEARCH_FORMAT

=item SWISH_LATIN1_ENCODING

=item SWISH_LOCALE

=item SWISH_LUCY_FORMAT

=item SWISH_MAXSTRLEN

=item SWISH_MAX_FILE_LEN

=item SWISH_MAX_HEADERS

=item SWISH_MAX_SORT_STRING_LEN

=item SWISH_MAX_WORD_LEN

=item SWISH_META

=item SWISH_MIME

=item SWISH_MIN_WORD_LEN

=item SWISH_PARSERS

=item SWISH_PARSER_HTML

=item SWISH_PARSER_TXT

=item SWISH_PARSER_XML

=item SWISH_PATH_SEP_STR

=item SWISH_PREFIX_MTIME

=item SWISH_PREFIX_URL

=item SWISH_PROP

=item SWISH_PROP_DATE

=item SWISH_PROP_DBFILE

=item SWISH_PROP_DESCRIPTION

=item SWISH_PROP_DOCID

=item SWISH_PROP_DOCPATH

=item SWISH_PROP_ENCODING

=item SWISH_PROP_INT

=item SWISH_PROP_MIME

=item SWISH_PROP_MTIME

=item SWISH_PROP_NWORDS

=item SWISH_PROP_PARSER

=item SWISH_PROP_RANK

=item SWISH_PROP_RECCNT

=item SWISH_PROP_SIZE

=item SWISH_PROP_STRING

=item SWISH_PROP_TITLE

=item SWISH_RD_BUFFER_SIZE

=item SWISH_SPECIAL_ARG

=item SWISH_STACK_SIZE

=item SWISH_SWISH_FORMAT

=item SWISH_TITLE_METANAME

=item SWISH_TITLE_TAG

=item SWISH_TOKENIZE

=item SWISH_TOKENPOS_BUMPER

=item SWISH_TOKEN_LIST_SIZE

=item SWISH_TRUE

=item SWISH_UNDEFINED_METATAGS

=item SWISH_UNDEFINED_XML_ATTRIBUTES

=item SWISH_URL_LENGTH

=item SWISH_VERSION

=item SWISH_WORDS

=item SWISH_XAPIAN_FORMAT

=back

=head1 BUGS AND LIMITATIONS

libswish3 is not yet ported to Windows.

=head1 AUTHOR

Peter Karman C<< perl@peknet.com >>

=head1 COPYRIGHT

Copyright 2010 Peter Karman.

This file is part of libswish3.

libswish3 is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

libswish3 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.


=head1 SEE ALSO

L<http://swish3.dezi.org/>, L<http://swish-e.org/>

L<SWISH::Prog>, L<Dezi::App>

=cut