lib/SWISH/Prog/Aggregator/Mail.pm

package SWISH::Prog::Aggregator::Mail;
use strict;
use warnings;

use Carp;
use Data::Dump qw( dump );
use Search::Tools::XML;
use Mail::Box::Manager;
use base qw( SWISH::Prog::Aggregator );

our $VERSION = '0.70';

my $XMLer = Search::Tools::XML->new();

=pod

=head1 NAME

SWISH::Prog::Aggregator::Mail - crawl a mail box

=head1 SYNOPSIS
    
    use SWISH::Prog::Aggregator::Mail;
    
    my $aggregator = 
        SWISH::Prog::Aggregator::Mail->new( 
            indexer => SWISH::Prog::Indexer::Native->new()
        );
    
    $aggregator->indexer->start;
    $aggregator->crawl('path/to/my/maildir');
    $aggregator->indexer->finish;


=head1 DESCRIPTION

SWISH::Prog::Aggregator::Mail is a SWISH::Prog::Aggregator
subclass designed for providing full-text search for your email.

SWISH::Prog::Aggregator::Mail uses Mail::Box, available from CPAN.

=head1 METHODS

Since SWISH::Prog::Aggregator::Mail inherits from SWISH::Prog::Aggregator, 
read the SWISH::Prog::Aggregator docs first. 
Any overridden methods are documented here.

=head2 init

Adds the special C<mail> MetaName to the Config object before
opening indexer.

=cut

sub init {
    my $self = shift;
    $self->SUPER::init(@_);

    # add top-level metaname
    $self->config->MetaNameAlias('swishdefault mail');

    my @meta = qw(
        url
        id
        subject
        date
        size
        from
        to
        cc
        bcc
        type
        part
    );

    $self->config->MetaNames(@meta);
    $self->config->PropertyNames(@meta);

    # save all body text in the swishdescription property for excerpts
    $self->config->StoreDescription('XML* <body>');

}

# basic flow:
# recurse through maildir, get all messages,
# convert each message to xml, create Doc object and call index()

=head2 crawl( I<path_to_maildir> )

Create index. 

Returns number of emails indexed.

=cut

sub crawl {
    my $self    = shift;
    my $maildir = shift or croak "maildir required";
    my $manager = Mail::Box::Manager->new;

    $self->{count} = 0;

    my $folder = $manager->open(
        folderdir => $maildir,
        folder    => '=',
        extract   => 'ALWAYS'
    ) or croak "can't open $maildir";

    $self->_process_folder($folder);

    $folder->close( write => 'NEVER' );

    return $self->{count};
}

sub _addresses {
    return join( ', ', map { ref($_) ? $_->format : $_ } @_ );
}

sub _process_folder {
    my $self = shift;
    my $folder = shift or croak "folder required";

    my @subs    = sort $folder->listSubFolders;
    my $indexer = $self->indexer;

    for my $sub (@subs) {
        my $subf = $folder->openSubFolder($sub);

        warn "searching $sub\n" if $self->verbose;

        foreach my $message ( $subf->messages ) {
            my $doc = $self->get_doc( $sub, $message );
            $indexer->process($doc);
            $self->_increment_count;
        }

        $self->_process_folder($subf);

        $subf->close( write => 'NEVER' );
    }

}

sub _filter_attachment {
    my $self    = shift;
    my $msg_url = shift or croak "message url required";
    my $attm    = shift or croak "attachment object required";

    my $type     = $attm->body->mimeType->type;
    my $filename = $attm->body->dispositionFilename;
    my $content  = $attm->decoded . '';                # force stringify

    if ( $self->swish_filter_obj->can_filter($type) ) {

        my $f = $self->swish_filter_obj->convert(
            document     => \$content,
            content_type => $type,
            name         => $filename,
        );

        if (   !$f
            || !$f->was_filtered
            || $f->is_binary )    # is is_binary necessary?
        {
            warn "skipping $filename in message $msg_url - filtering error\n";
            return '';
        }

        $content = ${ $f->fetch_doc };
    }

    return join( '',
        '<title>',  $XMLer->escape($filename),
        '</title>', $XMLer->escape($content) );

}

=head2 get_doc( I<folder>, I<Mail::Message> )

Extract data and content from I<Mail::Message> in I<folder> and return
doc_class() object.

=cut

sub get_doc {
    my $self    = shift;
    my $folder  = shift or croak "folder required";
    my $message = shift or croak "mail meta required";

    # >head->createFromLine;
    my %meta = (
        url => join( '.', $folder, $message->messageId ),
        id  => $message->messageId,
        subject => $message->subject || '[ no subject ]',
        date    => $message->timestamp,
        size    => $message->size,
        from    => _addresses( $message->from ),
        to      => _addresses( $message->to ),
        cc      => _addresses( $message->cc ),
        bcc     => _addresses( $message->bcc ),
        type    => $message->contentType,
    );

    my @parts = $message->parts;

    for my $part (@parts) {
        push(
            @{ $meta{parts} },
            $self->_filter_attachment( $meta{url}, $part )
        );
    }

    my $title = $meta{subject};

    my $xml = $self->_mail2xml( $title, \%meta );

    my $doc = $self->doc_class->new(
        content => $xml,
        url     => $meta{url},
        modtime => $meta{date},
        parser  => 'XML*',
        type    => 'application/xml',
        data    => \%meta
    );

    return $doc;
}

sub _mail2xml {
    my $self  = shift;
    my $title = shift;
    my $meta  = shift;

    my $xml
        = "<mail>"
        . "<swishtitle>"
        . $XMLer->utf8_safe($title)
        . "</swishtitle>"
        . "<head>";

    for my $m ( sort keys %$meta ) {

        if ( $m eq 'parts' ) {

            $xml .= '<body>';
            for my $part ( @{ $meta->{$m} } ) {
                $xml .= '<part>';
                $xml .= $part;
                $xml .= '</part>';
            }
            $xml .= '</body>';
        }
        else {
            $xml .= $XMLer->start_tag($m);
            $xml .= $XMLer->escape( $meta->{$m} );
            $xml .= $XMLer->end_tag($m);
        }
    }

    $xml .= "</head></mail>";

    return $xml;
}

1;

__END__

=head1 AUTHOR

Peter Karman, E<lt>perl@peknet.comE<gt>

=head1 BUGS

Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SWISH-Prog>.  
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc SWISH::Prog


You can also look for information at:

=over 4

=item * Mailing list

L<http://lists.swish-e.org/listinfo/users>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=SWISH-Prog>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/SWISH-Prog>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/SWISH-Prog>

=item * Search CPAN

L<http://search.cpan.org/dist/SWISH-Prog/>

=back

=head1 COPYRIGHT AND LICENSE

Copyright 2008-2009 by Peter Karman

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=head1 SEE ALSO

L<http://swish-e.org/>
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)