The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package PDF::FromHTML;
$PDF::FromHTML::VERSION = '0.12';

use 5.006;
use strict;
use warnings;

BEGIN {
    foreach my $method ( qw( pdf twig tidy args ) ) {
        no strict 'refs';
        *$method = sub { $#_ ? ($_[0]{$method} = $_[1]) : $_[0]{$method} };
    }
}

use Cwd;
use File::Temp;
use File::Basename;

use PDF::Writer;
use PDF::Template;
use PDF::FromHTML::Twig;

use constant HAS_UNICODE_SUPPORT => ($] >= 5.008);

use constant PDF_WRITER_BACKEND => do {
    local $@;

    # For Perl 5.6.x, we have to use pdflib
    PDF::Writer->import('pdflib')
        unless HAS_UNICODE_SUPPORT();

    eval { ref(PDF::Writer->new) }
        or die( "Please install PDF::API2 (preferred) or pdflib_pl first" );
};
use constant HAS_HTML_TIDY => do {
    local $@;
    eval { require HTML::Tidy; 1 } or do {
        unless ( eval { require XML::Clean; 1 } ) {
            die( "Please install HTML::Tidy (preferred) or XML::Clean first" );
        }
        0; # Has XML::Clean but no HTML::Tidy
    };
};

=head1 NAME

PDF::FromHTML - Convert HTML documents to PDF

=head1 VERSION

This document describes version 0.12 of PDF::FromHTML,
released December 10, 2005.

=head1 SYNOPSIS

    my $pdf = PDF::FromHTML->new( encoding => 'utf-8' );
    $pdf->load_file('source.html');
    $pdf->convert(
        Font => '/path/to/font.ttf',
        LineHeight => 10,
        Landscape => 1,
    );
    $pdf->write_file('target.pdf');

=head1 DESCRIPTION

This module transforms HTML into PDF, using an assortment of XML
transformations implemented in L<PDF::FromHTML::Twig>.

There is also a command-line utility, L<html2pdf.pl>, that comes
with this distribution.

=head1 PUBLIC METHODS

=cut

sub new {
    my $class = shift;
    bless({
        twig => PDF::FromHTML::Twig->new,
        args => { @_ },
    }, $class);
}

sub load_file {
    my ($self, $file) = @_;
    $self->{file} = $file;
}

sub parse_file {
    my $self = shift;
    my $file = $self->{file};
    my $content = '';

    my $dir = Cwd::getcwd();

    if (!ref $file) {
        open my $fh, $file or die $!;
        chdir File::Basename::dirname($file);
        $content = do { local $/; <$fh> };
    }
    else {
        $content = $$file;
    }

    my $encoding = ($self->args->{encoding} || 'utf8');
    if (HAS_UNICODE_SUPPORT() and $self->args) {
        require Encode;
        $content = Encode::decode($encoding, $content, Encode::FB_XMLCREF());
    }

    $content =~ s{&nbsp;}{}g;
    $content =~ s{<!--.*?-->}{}gs;

    if (HAS_HTML_TIDY()) {
        if (HAS_UNICODE_SUPPORT()) {
            $content = Encode::encode( ascii => $content, Encode::FB_XMLCREF());
        }
        $content = HTML::Tidy->new->clean(
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<html xmlns="http://www.w3.org/1999/xhtml">',
            $content,
        );
    }
    else {
        $content =~ s{&#(\d+);}{chr $1}eg;
        $content =~ s{&#x([\da-fA-F]+);}{chr hex $1}eg;
        $content = XML::Clean::clean($content, '1.0', { encoding => 'UTF-8' });
        $content =~ s{<(/?\w+)}{<\L$1}g;
    }

    $self->twig->parse( $content );

    chdir $dir;
}

=head2 convert(%params)

Convert the loaded file to PDF.  Valid parameters are:

    PageWidth         640
    PageResolution    540
    FontBold          'HelveticaBold'
    FontOblique       'HelveticaOblique'
    FontBoldOblique   'HelveticaBoldOblique'
    LineHeight        12
    FontUnicode       'Helvetica'
    Font              (same as FontUnicode)
    PageSize          'A4'
    Landscape         0

=cut

sub convert {
    my ($self, %args) = @_;

    {
        # import arguments into Twig parameters
        no strict 'refs';
        ${"PDF::FromHTML::Twig::$_"} = $args{$_} foreach keys %args;
    }

    $self->parse_file;

    my ($fh, $filename) = File::Temp::tempfile(
        SUFFIX => '.xml',
        UNLINK => 1,
    );

    binmode($fh);
    if (HAS_UNICODE_SUPPORT()) {
        binmode($fh, ':utf8');
    }

    # XXX HACK! XXX
    my $text = $self->twig->sprint;
    $text =~ s{\$(__[A-Z_]+__)}{<var name='$1' />}g;
    print $fh $text;
    close $fh;

    # print STDERR "==> Temp file written to $filename\n";

    local $^W;
    $self->pdf(eval { PDF::Template->new( filename => $filename ) })
      or die "$filename: $@";
    $self->pdf->param(@_);
}

sub write_file {
    my $self = shift;
    local $^W;
    $self->pdf->write_file(@_);
}

1;

=head1 HINTS & TIPS

=head2 E<lt>imgE<gt> tags

Add the height and width attributes if you are creating the source HTML,
it keeps PDF::FromHTML from having to open and read the source image file
to get the real size.  Less file I/O means faster processing.

=head1 SEE ALSO

L<PDF::FromHTML::Twig>, L<PDF::Template>, L<XML::Twig>.

=head1 AUTHORS

Audrey Tang E<lt>autrijus@autrijus.orgE<gt>

=head1 CONTRIBUTORS

Charleston Software Associates E<lt>info@charletonsw.comE<gt>

=head1 COPYRIGHT

Copyright 2004, 2005 by Audrey Tang E<lt>autrijus@autrijus.orgE<gt>.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

See L<http://www.perl.com/perl/misc/Artistic.html>

=cut