The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
use strict;
use warnings;

###################################################################################
#
#    Copyright (C) 2001 Bill Moseley swishscript@hank.org
#    Program to test the SWISH::Filter module
#
#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License
#    as published by the Free Software Foundation; either version
#    2 of the License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    The above lines must remain at the top of this program
#
#
#
#   5 July 2006 - perl@peknet.com added --ignore_filters option
#   13 July 2006 - perl@peknet.com added metadata dump
#
####################################################################################

use Getopt::Long;
use SWISH::Filter;
use Pod::Usage;
use URI;
use Data::Dumper;

use constant ABORT => 0;
use constant DEBUG => 1;
use constant INFO  => 2;

my $version = '0.02';

my (
    $verbose, $show_content, @file,  @url,
    $help,    $man,          $quiet, $headers,
    $path,    $depreciated,  $mimetypes
   );

my $skip_binary = 1;

my $lines          = 10;
my $max_chars      = 1000;
my @ignore_filters = ();

GetOptions(
    'verbose!'          => \$verbose,         # turn on INFO messages
    'content!'          => \$show_content,
    'quiet'             => \$quiet,
    'lines=i'           => \$lines,
    'help|?'            => \$help,
    'man'               => \$man,
    'headers'           => \$headers,
    'skip_binary!'      => \$skip_binary,
    'path'              => \$path,
    'depreciated'       => \$depreciated,
    'mimetypes'         => \$mimetypes,
    'ignore_filters=s@' => \@ignore_filters

          ) || pod2usage(2);
pod2usage(-verbose => 1) if $help;
pod2usage(-verbose => 2) if $man;

if ($path)
{
    print '/usr/local/lib/swish-e/perl', "\n";
    exit;
}

pod2usage(
          -verbose => 0,
          -message => "Must specify a file or URL",
          -exitvar => 1,
         )
  unless @ARGV or $mimetypes;

$ENV{FILTER_DEBUG} = 1 if $verbose;    # used by SWISH::Filter

msg(INFO, "SWISH::Filter found at [%s]\n", $INC{'SWISH/Filter.pm'});

my $filter = SWISH::Filter->new(ignore_filters => \@ignore_filters);

if ($mimetypes)
{
    my @filters = $filter->filter_list;
    print "Mimetypes:\n\n";
    for my $filter (@filters)
    {
        print "  $filter:\n";
        for my $pattern ($filter->mimetypes)
        {
            print "     $pattern\n";
        }
        print "\n";
    }
}

my $return = 0;

for my $doc (@ARGV)
{
    eval { $depreciated ? process_doc_old($doc) : process_doc($doc) };
    $return = 1 if $@;
    warn "** $0:\n  $@\n" if $@;    # always warn on die
}

exit $return;

sub process_doc
{
    my ($file) = @_;

    my $uri;
    eval { $uri = URI->new($file) };
    my %config = !$@ && $uri->scheme ? fetch_url($file) : fetch_file($file);

    my $doc =
      $filter->convert(
                       %config,
                       name      => $file,
                       meta_data => {'convertedBy' => 'swish-filter-test'}
                      );

    die "Failed to process document [$file]\n" unless $doc;

    my $content_type = $doc->content_type      || "unknown";
    my $parser_type  = $doc->swish_parser_type || '';

    my $binary = $doc->is_binary;

    my $msg = $doc->was_filtered ? '' : 'not';

    my $name = $doc->name;

    msg(DEBUG, <<EOF );

Document $file was $msg filtered.
   Document:     $file  ($name)
   Content-Type: $content_type
   Parser type:  $parser_type
EOF

    if (my $filters_used = $doc->filters_used)
    {
        for my $filter (@$filters_used)
        {
            my $class = ref($filter->{name});
            msg(DEBUG,
                "   >Filter used: $class  ( $filter->{start_content_type} -> $filter->{end_content_type} )"
               );
            msg(DEBUG, "   >Metadata: " . Dumper($doc->meta_data || 'none'));
        }
    }

    if (!$binary)
    {
        my @doc = split /\n/, substr(${$doc->fetch_doc}, 0, $max_chars);
        my $toshow = $lines;
        $toshow = scalar(@doc) - 1 if $toshow >= @doc;
        msg(INFO, join "\n",
            '-- Output Content Sample --',
            @doc[0 .. $toshow],
            '', '-- end --', '');
    }

    die "Skipping binary [$file]\n" if $binary && $skip_binary;

    if ($headers)
    {
        my $len = length ${$doc->fetch_doc};

        print "Path-Name: $file\nContent-Length: $len\n";
        print "Document-Type: $parser_type\n" if $parser_type;
        print "\n";
    }

    print ${$doc->fetch_doc} if $show_content;
}

sub process_doc_old
{
    my ($file) = @_;

    my $uri;
    eval { $uri = URI->new($file) };
    my %config = !$@ && $uri->scheme ? fetch_url($file) : fetch_file($file);

    my $was_filtered = $filter->filter(%config, name => $file,);

    my $content_type      = $filter->content_type          || "unknown";
    my $orig_content_type = $filter->original_content_type || "unknown";
    my $parser_type       = $filter->swish_parser_type     || '';

    my $binary = $content_type !~ m[^text/];

    my $msg = $was_filtered ? '' : 'not';

    msg(DEBUG, <<EOF );

Document $file was $msg filtered.
   Document:     $file
   Content-Type: $content_type  (initial was $orig_content_type)
   Parser type:  $parser_type
EOF

    if (!$binary)
    {
        my @doc = split /\n/, substr(${$filter->fetch_doc}, 0, $max_chars);
        $lines = @doc - 1 if $lines >= @doc;
        msg(INFO, join "\n",
            '-- Output Content Sample --',
            @doc[0 .. $lines],
            '', '-- end --', '');
    }

    die "Skipping binary [$file]\n" if $binary && $skip_binary;

    if ($headers)
    {
        my $len = length ${$filter->fetch_doc};

        print "Path-Name: $file\nContent-Length: $len\n";
        print "Document-Type: $parser_type\n" if $parser_type;
        print "\n";
    }

    print ${$filter->fetch_doc} if $show_content;
}

sub fetch_file
{
    my $file = shift;

    # just try to open for error reporting
    open FH, "<$file" or die "Failed to open '$file': $!\n";

    close FH;

    die "File '$file' has zero size\n" if -z $file;
    return (document => $file);

}

sub fetch_url
{
    my $url = shift;

    eval { require LWP::UserAgent };
    die "LWP::UserAgent is required to fetch a URL\n$@\n" if $@;

    my $ua       = LWP::UserAgent->new;
    my $request  = HTTP::Request->new('GET', $url);
    my $response = $ua->request($request);
    die sprintf "Error while getting '%s'.  Server returned %s.",
      $response->request->uri, $response->status_line
      unless $response->is_success;

    my $content      = $response->content;
    my $content_type = $response->content_type;

    return (
            document     => \$content,
            content_type => $content_type,
           );
}

sub msg
{
    my $msg_level = shift;

    return if $quiet;
    return if !$verbose && $msg_level > DEBUG;
    printf(STDERR @_), print STDERR "\n";
}

__END__

=head1 NAME

swish-filter-test - program to test the SWISH::Filter module.

=head1 SYNOPSIS

swish-filter-test [options] <file or url>  <...>

 Options:
   -quiet            don't generate messages to stderr
   -content          output content to stdout
   -(no)skip_binary  skip output of binary files (default)
   -lines <num>      Number of lines of content to display to stderr if verbose
   -headers          output with headers for swish-e -S prog method
   -verbose          enable $ENV{FILTER_DEBUG} for verbose output
   -path             output @INC path to SWISH::Filter module
   -help             brief help message
   -man              full documentation
   -ignore <filter>  ignore <filter> in selecting a filter to use

=head1 DESCRIPTION

swish-filter-test is a program to test the Perl module SWISH::Filter.
SWISH::Filter is a module that is included with the swish-e distribution.

Documents supplied to this script (as a URL or a plain file) on the command line
are passed to the SWISH::Filter module.  This is useful for testing filters.

The SWISH::Filter module works by checking a document's content-type and looking
for an available filter.  Most filters require additional helper programs (e.g.
the filter to convert PDF to HTML requires the Xpdf package).  Using the -verbose
options should indicate if a required program is missing.

Options to this script control how much output is generated.  Options can also be specified
to generate output that can be piped directly to swish-e (see C<-headers> below).

All messages are sent to stderr unless --content or -headers are specified and then content
is sent to stdout.


=head1 OPTIONS

=over 8

=item B<-quiet>

Don't write info out to stderr.  Normally not useful unless you just want to filter
a document and not really test the SWISH::Filter module.

Fatal errors are written to
stderr error regardless of the -quiet option.

=item B<-verbose>

Enables FILTER_DEBUG mode in the SWISH::Filter module, and enables extra info
including a summary of the filtered document to stderr.  Enable if trying to find
out why a filter does not work.


=item B<-lines>

Number of summary lines of summary content to show.  Summary lines are only showed if -verbose
is selected.  Lines are sent to stderr, not stdout.

Note, the summary is limited to 1000 characters regardless of this option.

=item B<-content>

Specifying -content will output full content to stdout.  The default is to only display
a few lines.

=item B<-(no)skip_binary>

The default is to not output content from binary files.  -noskip_binary will disable this.

=item B<-headers>

Prints the headers used by swish-e when reading documents with -S prog.  This can be used to
filter documents directly to swish-e:

 swish-filter-test -headers -content http://localhost/ test.pdf | swish-e -S prog -i stdin -v1

=item B<-path>

Prints the installed location of the SWISH::Filter parent directory for use in PERL5LIB,
Allows using SWISH::Filter in other programs, or with the Swish-e -S http method with
swishspider.

For example:

   PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost


=item B<-help>

Print a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=back


=cut