The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

package main;

use warnings;
use strict;
use CAM::PDF;
use Getopt::Long;
use Pod::Usage;

our $VERSION = '1.59';

my %opts = (
            template   => 'crunchjpg_tmpl.pdf',

            verbose    => 0,
            help       => 0,
            version    => 0,
            skip       => {},
            only       => {},

            # Temporary values:
            onlyval    => [],
            skipval    => [],
            );

Getopt::Long::Configure('bundling');
GetOptions('S|skip=s'     => \@{$opts{skipval}},
           'O|only=s'     => \@{$opts{onlyval}},
           'v|verbose'  => \$opts{verbose},
           'h|help'     => \$opts{help},
           'V|version'  => \$opts{version},
           ) or pod2usage(1);
if ($opts{help})
{
   pod2usage(-exitstatus => 0, -verbose => 2);
}
if ($opts{version})
{
   print "CAM::PDF v$CAM::PDF::VERSION\n";
   exit 0;
}

foreach my $flag (qw( skip only ))
{
   foreach my $val (@{$opts{$flag.'val'}})
   {
      foreach my $key (split /\D+/xms, $val)
      {
         $opts{$flag}->{$key} = 1;
      }
   }
}
if (!-f $opts{template})
{
   die "Cannot find the template pdf called $opts{template}\n";
}

if (@ARGV < 2)
{
   pod2usage(1);
}

my $infile = shift;
my $outdir = shift;

my $doc = CAM::PDF->new($infile) || die "$CAM::PDF::errstr\n";

my $nimages = 0;
my $rimages = 0;
my %doneobjs;

foreach my $objnum (keys %{$doc->{xref}})
{
   my $xobj = $doc->dereference($objnum);
   if ($xobj->{value}->{type} eq 'dictionary')
   {
      my $im = $xobj->{value}->{value};
      if (exists $im->{Type} && $doc->getValue($im->{Type}) eq 'XObject' &&
          exists $im->{Subtype} && $doc->getValue($im->{Subtype}) eq 'Image')
      {
         my $ref = '(no name)';
         if ($im->{Name})
         {
            $ref = $doc->getValue($im->{Name});
         }
         my $w = $im->{Width} || $im->{W} || 0;
         if ($w)
         {
            $w = $doc->getValue($w);
         }
         my $h = $im->{Height} || $im->{H} || 0;
         if ($h)
         {
            $h = $doc->getValue($h);
         }

         next if (exists $doneobjs{$objnum});

         $nimages++;
         _inform("Image $nimages, $ref = object $objnum, (w,h)=($w,$h)", $opts{verbose});

         if (exists $opts{skip}->{$objnum} ||
             (0 < scalar keys %{$opts{only}} && !exists $opts{only}->{$objnum}))
         {
            _inform("Skipping object $objnum", $opts{verbose});
            next;
         }

         my $isjpg = _isjpg($im);

         my $oldsize = $doc->getValue($im->{Length});
         if (!$oldsize)
         {
            die "PDF error: Failed to get size of image\n";
         }

         my $tmpl = CAM::PDF->new($opts{template}) || die "$CAM::PDF::errstr\n";

         # Get a handle on the needed data bits from the template
         my $media_array = $tmpl->getValue($tmpl->getPage(1)->{MediaBox});
         my $rawpage = $tmpl->getPageContent(1);

         $media_array->[2]->{value} = $w;
         $media_array->[3]->{value} = $h;
         my $page = $rawpage;
         $page =~ s/xxx/$w/igxms;
         $page =~ s/yyy/$h/igxms;
         $tmpl->setPageContent(1, $page);
         $tmpl->replaceObject(9, $doc, $objnum, 1);

         my $ofile = "/tmp/crunchjpg.$$";
         $tmpl->cleanoutput($ofile);

         if (!-d $outdir)
         {
            require File::Path;
            File::Path::mkpath($outdir);
         }
         if ($isjpg)
         {
            my $result = `convert -quality 50 -density 72x72 -page ${w}x$h pdf:$ofile jpg:$outdir/$objnum.jpg`;  ## no critic (Backtick)
            _inform($result, $opts{verbose});
         }
         else
         {
            my $result = `convert -density 72x72 -page ${w}x$h pdf:$ofile gif:$outdir/$objnum.gif`;  ## no critic (Backtick)
            _inform($result, $opts{verbose});
         }

         $doneobjs{$objnum} = 1;
         $rimages++;
      }
   }
}

_inform("Extracted $rimages of $nimages images", $opts{verbose});


sub _isjpg
{
   my $im = shift;
   return if (!$im->{Filter});

   my $f = $im->{Filter};
   my @names = $f->{type} eq 'array' ? @{$f->{value}} : $f;
   for my $e (@names)
   {
      my $name = $doc->getValue($e);
      if (ref $name)
      {
         $name = $name->{value};
      }
      #warn "Checking $name\n";
      if ($name eq 'DCTDecode')
      {
         return 1;
      }
   }
   return;
}

sub _inform
{
   my $str     = shift;
   my $verbose = shift;

   if ($verbose)
   {
      print STDERR $str, "\n";
   }
   return;
}

__END__


=for stopwords extractallimages.pl ImageMagick

=head1 NAME

extractallimages.pl - Save copies of all PDF images to a directory

=head1 SYNOPSIS

 extractallimages.pl [options] infile.pdf outdirectory

 Options:
   -O --only=imnum     only output the specified images (can be used mutliple times)
   -S --skip=imnum     don't output the specified images (can be used mutliple times)
   -v --verbose        print diagnostic messages
   -h --help           verbose help message
   -V --version        print CAM::PDF version

C<imnum> is a comma-separated list of integers indicating the images
in order that they appear in the PDF.  Use F<listimages.pl> to retrieve
the image numbers.

=head1 DESCRIPTION

Requires the ImageMagick B<convert> program to be available

Searches the PDF for images and saves them as individual files in the
specified directory.  The files are named C<E<lt>imnumE<gt>.jpg> or C<E<lt>imnumE<gt>.gif>.

=head1 SEE ALSO

CAM::PDF

F<crunchjpgs.pl>

F<listimages.pl>

F<extractjpgs.pl>

F<uninlinepdfimages.pl>

=head1 AUTHOR

See L<CAM::PDF>

=cut