The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

package main;

use warnings;
use strict;
use CAM::PDF;
use Getopt::Long;
use Pod::Usage;

our $VERSION = '1.52';

my %opts = (
            verbose    => 0,
            help       => 0,
            version    => 0,
            );

Getopt::Long::Configure('bundling');
GetOptions('v|verbose'  => \$opts{verbose},
           'h|help'     => \$opts{help},
           'V|version'  => \$opts{version},
           ) or pod2usage(1);
if ($opts{help})
{
   pod2usage(-exitstatus => 0, -verbose => 2);
}
if ($opts{version})
{
   print "CAM::PDF v$CAM::PDF::VERSION\n";
   exit 0;
}

if (@ARGV < 1)
{
   pod2usage(1);
}

my $file = shift;

my $doc = CAM::PDF->new($file) || die "$CAM::PDF::errstr\n";

my $pages = $doc->numPages();
my $nimages = 0;
for my $p (1..$pages)
{
   my $c = $doc->getPageContent($p);
   my @parts = split /(\/[\w]+\s*Do)\b/xms, $c;
   foreach my $part (@parts)
   {
      if ($part =~ /\A(\/[\w]+)\s*Do\z/xms)
      {
         $nimages++;
         my $ref = $1;
         my $xobj = $doc->dereference($ref, $p);
         my $objnum = $xobj->{objnum};
         my $im = $doc->getValue($xobj);
         my $l = $im->{Length} || $im->{L} || 0;
         if ($l)
         {
            $l = $doc->getValue($l);
         }
         my $w = $im->{Width} || $im->{W} || 0;
         if ($w)
         {
            $w = $doc->getValue($w);
         }
         my $h = $im->{Height} || $im->{H} || 0;
         if ($h)
         {
            $h = $doc->getValue($h);
         }
         print "Image $nimages page $p, (w,h)=($w,$h), ref $ref = object $objnum, length $l\n";
      }
      else
      {
         # Ths code may break if there is are legitimate strings "BI",
         # "ID" and "EI" in order in the page (which happened in the
         # PDF reference doc, of course!

       BI:
         while ($part =~ s/.*?\bBI\b\s*//xms)
         {
            my ($im) = $part =~ s/\A(.*?)\s*\bEI\b\s*//xms;
            next BI if (!$im);

            $im =~ s/\A.*\bBI\b//xms;  # this may get rid of a fake BI if there is one in the page

            # Easy tests:
            next BI if ($im =~ m/ \A [)] /xms);
            next BI if ($im =~ m/ [(] \z /xms);
            next BI if ($im !~ m/ \bID\b /xms);

            # make sure that there is an open paren before every close
            # if not, then the "BI" was part of a string
            my $test = $im;
            $test =~ s/ \\[()] //gxms; # get rid of escaped parens for the test
            while ($test =~ s/ \A(.*?) [)] //xms)
            {
               my $bit = $1;
               next BI if ($bit !~ m/ [(] /xms);
            }

            $nimages++;
            my $w = 0;
            my $h = 0;
            if ($im =~ m/ \/W(|idth)\s*(\d+) /xms)
            {
               $w = $2;
            }
            if ($im =~ m/ \/H(|eight)\s*(\d+) /xms)
            {
               $h = $2;
            }
            print "Image $nimages page $p, (w,h)=($w,$h), inline\n";
         }
      }
   }
}

__END__

=for stopwords listimages.pl

=head1 NAME

listimages.pl - Save copies of all PDF JPG images to a directory

=head1 SYNOPSIS

 listimages.pl [options] infile.pdf

 Options:
   -v --verbose        print diagnostic messages
   -h --help           verbose help message
   -V --version        print CAM::PDF version

=head1 DESCRIPTION

Searches the PDF for images and lists them on STDOUT in one of the
following formats:

  Image <n> page <p>, (w,h)=(<w>,<h>), ref <label> = object <objnum>, length <l>\n";
  Image <n> page <p>, (w,h)=(<w>,<h>), inline

=head1 SEE ALSO

CAM::PDF

F<crunchjpgs.pl>

F<extractallimages.pl>

F<extractjpgs.pl>

F<uninlinepdfimages.pl>

=head1 AUTHOR

See L<CAM::PDF>

=cut