# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#
# PDF::Parse.pm, version 1.11 February 2000 antro
#
# Copyright (c) 1998 - 2000 Antonio Rosella Italy antro@tiscalinet.it, Johannes Blach dw235@yahoo.com 
#
# Free usage under the same Perl Licence condition.
#

package PDF::Parse;

# Version of this module; it is assigned through the fully qualified name,
# so no "our"/"use vars" declaration is needed for it.
$PDF::Parse::VERSION = "1.11";

=pod

=head1 NAME

PDF::Parse - Library with parsing functions for PDF library

=head1 SYNOPSIS

  use PDF::Parse;

  $pdf->TargetFile($filename);
  $pdf->LoadPageInfo;

  $version = $pdf->Version;
  $bool = $pdf->IsaPDF;
  $bool = $pdf->IscryptPDF;

  $info = $pdf->GetInfo ($key);
  $pagenum = $pdf->Pages;

  @size = $pdf->PageSize ($page);
  # or
  @size = $pdf->PageSize;

  $rotation = $pdf->PageRotation ($page);
  # or
  $rotation = $pdf->PageRotation;

=head1 DESCRIPTION

The main purpose of the PDF::Parse library is to provide parsing functions
for the more general PDF library.

=head1 Methods

The available methods are:

=cut

# Minimum interpreter version, and the module this package inherits from
# and whose PDF::Core::* helper functions are called below.
require 5.005;
require PDF::Core;

use strict;
use Carp;
# Loaded with an empty import list: only the class is needed for @ISA.
use Exporter ();

use vars qw(@ISA @EXPORT_OK);

# Inherit from Exporter and from PDF::Core.
@ISA = qw(Exporter PDF::Core);

# Names that may be imported on request (no @EXPORT list is set in this
# part of the file).
@EXPORT_OK = qw( LoadPageInfo GetInfo TargetFile
				 Pages PageSize PageRotation IsaPDF
				 Version IscryptPDF );

#################################################################
#
# ReadCrossReference_pass1 ( fd, offset, self )
#
# Reads the cross-reference section found at $offset in the file handle
# $fd, records its entries in $self, then reads the trailer dictionary
# that follows it. If the trailer has a numeric /Prev entry, the function
# calls itself on that offset as well.
#
# Parameters:
#   $fd      file handle to read from (switched to binmode here)
#   $offset  position of the section; passed by reference to
#            PDF::Core::PDFGetline, so it is presumably advanced by that
#            helper -- TODO confirm against PDF::Core
#   $self    object whose Objects, Gen_Num, Trailer, Cross_Reference_Size,
#            Root_Object, Crypt_Object and Updated fields are filled in
#
# Returns the number of entries (in use and free) counted in this section
# plus whatever the recursive calls for /Prev sections returned.
#
# Dies if the first line read does not contain "xref".
#
# The global $_ is used as the line buffer and is not localized, so the
# caller's $_ is overwritten.
#
sub ReadCrossReference_pass1 {
  my $fd = shift;
  my $offset=shift;
  my $self=shift;

  # Object number of the first entry of the current subsection; it stays
  # undefined until a subsection header line has been seen.
  my $initial_number;
  # Entries seen so far in the current subsection.
  my $obj_counter=0;
  # Entries seen in all completed subsections (and in /Prev sections).
  my $global_obj_counter=0;
  # Not used: every branch below declares its own inner "my $buf".
  my $buf;

  binmode $fd;

  $_=PDF::Core::PDFGetline ($fd,\$offset);

  die "Can't read cross-reference section, according to trailer\n" if ! /xref\r?\n?/  ;

  # Read line by line until a line containing the word "trailer" shows up.
  # NOTE(review): no end-of-file check is visible here; whether the loop
  # can spin on a truncated file depends on what PDFGetline returns at EOF.
  while () {
    $_=PDF::Core::PDFGetline ($fd,\$offset);
    # Drop one leading line-feed and/or carriage return left over from the
    # previous line.
    s/^\n//;
    s/^\r//;
    last if (m/\btrailer\b/) ;
#
# An object in use ("n" entry). It is stored only if this slot is still
# undefined; because the section at $offset is read before its /Prev
# sections, a value recorded earlier is not replaced by an older one.
#
    /^\d+\s+\d+\s+n\r?\n?/ && do { my $buf =$_;
	       my $ind = $initial_number + ($obj_counter++);
               ( not defined $self->{Objects}[$ind] )&& 
		  # First 10 characters, and 5 characters from position 11,
		  # of the entry (presumably byte offset and generation
		  # number per the PDF xref entry layout -- confirm).
		  do { $self->{Objects}[$ind] = int substr($buf,0,10);
		       $self->{Gen_Num}[$ind] = int substr($buf,11,5);
		     };
	       # NOTE(review): $_ is reassigned at the top of the loop right
	       # after "next", so this restore-and-strip appears to have no
	       # effect unless PDFGetline reads $_ -- TODO confirm.
	       $_=$buf;
	       s/^.{18}//; 
	       next ;
   }; 
#
# A freed object ("f" entry). The first field is stored negated, so free
# slots can be told apart from the non-negative values of entries in use.
# NOTE(review): unlike the "n" branch, this one overwrites the slot even
# if it is already defined -- confirm this is intended when called for a
# /Prev section.
#
    /^\d+\s+\d+\s+f\r?\n?/ && do { my $buf =$_;
   	       my $objects_generation_nr = substr($buf,11,5);
	       my $Num=substr($buf,0,10);
	       my $ind = $initial_number + ($obj_counter++);
	       # $ind = $ind . "_" . $objects_generation_nr;
		       $self->{Objects}[$ind] = - $Num;
		       $self->{Gen_Num}[$ind] = $objects_generation_nr;
		       $_=$buf;
		       s/^.{18}//; 
		       next ;
     };
#
# A subsection header ("first count"): remember the first object number
# and fold the entries of the previous subsection into the global counter.
#
    /^\d+\s+\d+\r?\n?/  && do { 
 	my $buf = $_ ; 
 	 $initial_number = $buf; 
 	 $initial_number=~ s/^(\d+)\s+\d+\r?\n?.*/$1/; 
	 $global_obj_counter += $obj_counter;
 	 $obj_counter=0; 
	 next ;
    };
  }

  # Account for the entries of the last subsection.
  $global_obj_counter +=$obj_counter;
#
# Now the trailer for updates 
#

#
# Skip to start of dictionary.
# NOTE(review): loops until a line contains "<<"; no EOF check is visible.
#
    until (m/<</)
		{
		$_=PDF::Core::PDFGetline ($fd,\$offset);
		}

#
# Read the dictionary; the list returned by PDFGetPrimitive is used as the
# key/value pairs of %trailer.
#
    my %trailer = ( PDF::Core::PDFGetPrimitive ($fd, $offset) );

    # Only the first trailer read (the one reached from the offset given
    # by the outermost caller) is stored; recursive calls for /Prev find
    # /Root already set and skip this.
    if ($self->{"Trailer"}{"/Root"} eq "")
		{
		$self->{"Trailer"} = \%trailer;
		#
		# This code is here for backward compatibility only. If the content
		# of the root trailer is needed, use $self->{"Trailer"} instead.
		#
		$self->{"Cross_Reference_Size"} = $trailer{"/Size"};
		$self->{"Root_Object"} = $trailer{"/Root"};
		$self->{"Crypt_Object"} = $trailer{"/Encrypt"};
		}
	# A numeric /Prev points at another cross-reference section: mark the
	# file as updated, read that section too, then restore the file
	# position.
	if ($trailer{"/Prev"} =~ m/^\d+$/)
		{  
  		$self->{"Updated"} = 1;
		my $old_seek = tell $fd;
		$global_obj_counter += ReadCrossReference_pass1 ($fd,
            $trailer{"/Prev"}, $self );
		seek $fd, $old_seek, 0;
		}


  return $global_obj_counter;
}

#################################################################
#
# LoadPageSubtree ( ref [, inherited attributes ] )
#
# Walks the page tree below the object referenced by $ref and appends one
# hash per page to @{$self->{"Page"}}.
#
# Parameters:
#   $ref          reference handed to $self->GetObject to fetch this node
#   %inheritance  attributes collected from the ancestors of this node;
#                 values found on this node replace the inherited ones
#
# Each stored page hash contains the attributes in effect for the page
# plus "Page_Object" (the page's own reference) and, if any node on the
# path had /Resources, "Resource_Object".
#
# A node whose /Type is neither /Pages nor /Page is reported with carp
# and skipped.
#
# The prototype is kept for compatibility; it has no effect on the method
# calls used in this file.
#
sub LoadPageSubtree (\*$;%)
	{
	my $self = shift;
	my $ref = shift;
	my %inheritance = @_ ;

	my $data = $self->GetObject ($ref);

	# Check which attributes are inherited. Adobe did not add any new
	# inherited attributes in version 1.2 or later, so this list is
	# complete.

	# Do simple values.
	foreach my $key ("/Rotate", "/Dur", "/Hid", "/Trans",
					 "/MediaBox", "/CropBox")
		{
		next unless (defined ($data->{$key}));

		# A direct value is taken over as it is.
		unless ($data->{$key} =~ m/^\d+ \d+ R$/)
			{
			$inheritance{$key} = $data->{$key};
			next;
			}

		# It is an indirect reference: follow it until a real value
		# is reached.
		my $dataref = $data->{$key};
		do
			{
			$dataref = $self->GetObject ($dataref);
			}
		while ($dataref =~ m/^\d+ \d+ R$/);

		# The test must look at the resolved value, not at the page
		# dictionary itself, otherwise arrays are never recognized.
		if (UNIVERSAL::isa ($dataref, "ARRAY"))
			{
			$inheritance{$key} = [];
			foreach my $element (@{$dataref})
				{
				# Each element may be a reference. Work on a copy so
				# the array returned by GetObject is left untouched.
				my $value = $element;
				while ($value =~ m/^\d+ \d+ R$/)
					{
					$value = $self->GetObject ($value);
					}

				push @{$inheritance{$key}}, $value;
				}
			}
		else
			{
			$inheritance{$key} = $dataref;
			}
		}

	# If this object contains resources, replace information in inheritance
	$inheritance{"Resource_Object"} = $data->{"/Resources"}
	    if (defined ($data->{"/Resources"}));

	if ($data->{"/Type"} eq "/Pages")
		{
		# It's just an intermediate Node
		foreach my $kid (@{$data->{"/Kids"}})
			{
			$self->LoadPageSubtree ($kid, %inheritance);
			}
		}
	elsif ($data->{"/Type"} eq "/Page")
		{
		# We have a real page! Store a copy of the collected attributes.
		$inheritance{"Page_Object"} = $ref;
		push @{$self->{"Page"}}, +{ %inheritance };
		}
	else
		{
		# Strange stuff. Complain and discard.
		carp "While loading pages got object of type '", $data->{"/Type"}, "'";
		}
	}

#################################################################
=pod

=head2 TargetFile ( filename )

This method links the filename to the pdf descriptor and parses all
kinds of header information.

=cut

#
# Opens $file, checks the "%PDF" magic, stores the header version, locates
# the last cross-reference section and loads the trailer, Info, Catalog
# and root page tree objects into $self.
#
# Returns 1 on success, 0 if the file does not start with "%PDF" (in that
# case the file is closed again and the object is left unlinked).
#
# Croaks if the object is already linked to a file, if no file name is
# given, if the file cannot be opened, or if no "startxref" entry can be
# found at the end of the file.
#
sub TargetFile {
  my $self = shift;
  my $file = shift;

  croak "Already linked to the file ",$self->{File_Name},"\n" 
      if $self->{File_Name} ;

  croak "I need a file name (!)" unless $file;

  # Every object gets its own anonymous handle, and sysopen takes the name
  # literally, so characters such as ">" or "|" in $file cannot change the
  # open mode. Both modules are part of the core distribution and work
  # with the minimum Perl version this file asks for.
  require Symbol;
  require Fcntl;

  my $fh = Symbol::gensym ();
  sysopen ($fh, $file, Fcntl::O_RDONLY ()) or croak "can't open $file: $!";
  binmode $fh;

  my $buf = "";
  read ($fh, $buf, 4);
  if ( $buf ne "%PDF" ) {
    close $fh;
    print "File $file is not PDF compliant !\n" if $PDF::Verbose ;
    return 0 ;
  }

  # The next four bytes are "-x.y"; keep only the version number.
  read ($fh, $buf, 4);
  $buf =~ s/-//;
  my $header = $buf;

  # Look for the last "startxref <offset> %%EOF" at the end of the file.
  # The window is larger than the entry itself to tolerate trailing bytes
  # after %%EOF, and it never exceeds the file size, so the seek cannot
  # fail on short files.
  my $size = -s $fh;
  my $window = ($size < 1024) ? $size : 1024;
  my $tail = "";
  seek ($fh, -$window, 2);
  read ($fh, $tail, $window);

  # The greedy ".*" makes the match pick the last entry in the window.
  my ($offset) = ($tail =~ m/.*startxref\s*(\d+)\s*%%EOF/s);
  unless (defined $offset) {
    close $fh;
    croak "Can't find startxref in $file";
  }

  $self->{File_Name} = $file ;
  $self->{File_Handler} = $fh;
  $self->{Header} = $header;
  $self->{"Last_XRef_Offset"} = $offset;

  ReadCrossReference_pass1 ($fh, $offset, $self);
  $self->{"Info"} = $self->GetObject ($self->{"Trailer"}{"/Info"});
  $self->{"Catalog"} = $self->GetObject ($self->{"Trailer"}{"/Root"});
  $self->{"PageTree"} = $self->GetObject ($self->{"Catalog"}{"/Pages"});
  return 1;
}

#################################################################
=pod

=head2 LoadPageInfo

This function loads the information for all pages. This process can
take some time for big PDF-files.

=cut

# Discards any page information loaded earlier and rebuilds it by walking
# the page tree from the /Pages entry of the catalog. The result of
# LoadPageSubtree is passed back to the caller.
sub LoadPageInfo (\*)
	{
	my ($self) = @_;

	# Start from an empty page list.
	@{$self->{"Page"}} = ();

	my $root = $self->{"Catalog"}{"/Pages"};
	return $self->LoadPageSubtree ($root);
	}								



#################################################################
=pod

=head2 Version

Returns the PDF version used for writing the object file.

=cut

# Returns the value stored in the object's Header field (the version text
# taken from the file header by TargetFile).
sub Version {
  my ($self) = @_;
  return $self->{Header};
}

#################################################################
=pod

=head2 IsaPDF

Returns true, if the file could be parsed and is a PDF-file.

=cut

# Returns true if a header has been stored for this object, false
# otherwise. TargetFile stores the header only after it has found the
# "%PDF" magic at the start of the file.
#
# The test uses defined() rather than a numeric comparison with undef,
# which compared the header against 0 and warned on an undefined header.
sub IsaPDF { 
  my $self = shift;
  return defined $self->{Header};
}

#################################################################
=pod

=head2 IscryptPDF

Returns true if the PDF contains a crypt object. This indicates that
the data of the PDF-File is encrypted. In this case, not all functions
work as expected.

=cut

# Returns true if a crypt object has been recorded for this file (the
# Crypt_Object field is filled from the /Encrypt entry of the trailer),
# false otherwise.
#
# The test uses defined() rather than a numeric comparison with undef,
# which compared the value against 0 and warned when it was undefined.
sub IscryptPDF { 
  my $self = shift;
  return defined $self->{Crypt_Object};
}

#################################################################
=pod

=head2 GetInfo ( key )

Returns the various information contained in the info section of a PDF
file (if present). A PDF file can have:

  a title ==> GetInfo ("Title")
  a subject ==> GetInfo ("Subject")
  an author ==> GetInfo("Author")
  a creation date ==> GetInfo("CreationDate")
  a creator ==> GetInfo("Creator")
  a producer ==> GetInfo("Producer")
  a modification date ==> GetInfo("ModDate")
  some keywords ==> GetInfo("Keywords")

=cut

# Looks up "/<key>" in the Info dictionary of the object and returns
# whatever PDF::Core::UnQuoteString makes of that entry.
#
#   $type  key name without the leading slash, e.g. "Title"
sub GetInfo (\*$)
	{
	my ($self, $type) = @_;

	# The entry itself is handed over, exactly as before.
	return PDF::Core::UnQuoteString ($self->{"Info"}{"/$type"});
	}

#################################################################
=pod

=head2 Pages

Returns the number of pages of the PDF-file.

=cut

# Returns the /Count entry of the root page tree stored in the object.
sub Pages 
	{
	my ($self) = @_;
	my $tree = $self->{"PageTree"};

	return $tree->{"/Count"};
	}

#################################################################
=pod

=head2 PageSize ( [ page ] )

Returns the size of a page in the PDF-file. If no parameter is given,
the default size of the root page will be returned. This value may be
overridden for any page.

If the size of an individual page is requested and the page data is
not already loaded, the method B<LoadPageInfo> will be executed. This
may take some time for large PDF-files. The size of the root page is
always available and will never execute B<LoadPageInfo>.

=cut

# Returns the elements of the /MediaBox array of the given page, or of
# the root page tree when no (or a non-positive) page number is given.
# Returns undef if the page number exceeds /Count of the root page tree
# or if no /MediaBox is recorded.
#
# For a specific page the page list is loaded with LoadPageInfo first,
# unless it already holds at least one page.
sub PageSize (;$)
	{
	my ($self, $page) = @_;
	my $box;

	if ($page > 0)
		{
		return undef if ($page > $self->{"PageTree"}{"/Count"});

		# Load the per-page data on first use only.
		$self->LoadPageInfo if ($#{$self->{"Page"}} < 0);

		$box = $self->{"Page"}[$page - 1]{"/MediaBox"};
		}
	else
		{
		$box = $self->{"PageTree"}{"/MediaBox"};
		}

	return defined $box ? @{$box} : undef;
	}

#################################################################
=pod

=head2 PageRotation ( [ page ] )

Returns the rotation of a page in the PDF-file. If no parameter is given,
the default rotation of the root page will be returned. This value may be
overridden for any page.

If the rotation of an individual page is requested and the page data is
not already loaded, the method B<LoadPageInfo> will be executed. This
may take some time for large PDF-files. The rotation of the root page is
always available and will never execute B<LoadPageInfo>.

=cut
# Returns the /Rotate value of the given page, or of the root page tree
# when no (or a non-positive) page number is given, as a number; a missing
# value yields 0. Returns undef if the page number exceeds /Count of the
# root page tree.
#
# For a specific page the page list is loaded with LoadPageInfo first,
# unless it already holds at least one page. When $PDF::Verbose is set,
# the rotation is also printed.
sub PageRotation (;$)
	{
	my ($self, $page) = @_;
	my $rotate;

	if ($page > 0)
		{
		return undef if ($page > $self->{"PageTree"}{"/Count"});

		# Load the per-page data on first use only.
		$self->LoadPageInfo if ($#{$self->{"Page"}} < 0);

		$rotate = $self->{"Page"}[$page - 1]{"/Rotate"};
		}
	else
		{
		$rotate = $self->{"PageTree"}{"/Rotate"};
		}

	# Force a numeric result, so a missing value comes out as 0.
	my $degrees = 0 + $rotate;

	print "Rotation ", $degrees if ($PDF::Verbose);

	return $degrees;
	}
#################################################################
1;
__END__

=head1 Variables

The only available variable is :

=over

=item B<$PDF::Parse::VERSION>

Contains the version of the library installed

=back


=head1 Copyright

  Copyright (c) 1998 - 2000 Antonio Rosella Italy antro@tiscalinet.it, Johannes Blach dw235@yahoo.com 

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 Availability

The latest version of this library is likely to be available from:

http://www.geocities.com/CapeCanaveral/Hangar/4794/

=cut