package PDF::FromHTML::Twig;
use strict;
use warnings;
use XML::Twig;
use base 'XML::Twig';
use charnames ':full';
use Graphics::ColorNames qw( hex2tuple );
use File::Spec;
use File::Basename;
use List::Util qw( sum first reduce );
=head1 NAME
PDF::FromHTML::Twig - PDF::FromHTML guts
=head1 SYNOPSIS
(internal use only)
=head1 DESCRIPTION
No user-serviceable parts inside.
=cut
sub new {
my $class = shift;
XML::Twig::new($class, $class->TwigArguments, @_);
}
our $PageWidth = 640;
our $PageResolution = 540;
our $FontBold = 'HelveticaBold';
our $FontOblique = 'HelveticaOblique';
our $FontBoldOblique = 'HelveticaBoldOblique';
our $LineHeight = 12;
our $FontUnicode = 'Helvetica';
our $Font = $FontUnicode;
# $Font = '/usr/local/share/fonts/TrueType/minguni.ttf';
our $PageSize = 'A4';
our $Landscape = 0;
use constant SuperScript => [
"\N{SUPERSCRIPT ZERO}",
"\N{SUPERSCRIPT ONE}",
"\N{SUPERSCRIPT TWO}",
"\N{SUPERSCRIPT THREE}",
"\N{SUPERSCRIPT FOUR}",
"\N{SUPERSCRIPT FIVE}",
"\N{SUPERSCRIPT SIX}",
"\N{SUPERSCRIPT SEVEN}",
"\N{SUPERSCRIPT EIGHT}",
"\N{SUPERSCRIPT NINE}",
];
use constant SubScript => [
"\N{SUBSCRIPT ZERO}",
"\N{SUBSCRIPT ONE}",
"\N{SUBSCRIPT TWO}",
"\N{SUBSCRIPT THREE}",
"\N{SUBSCRIPT FOUR}",
"\N{SUBSCRIPT FIVE}",
"\N{SUBSCRIPT SIX}",
"\N{SUBSCRIPT SEVEN}",
"\N{SUBSCRIPT EIGHT}",
"\N{SUBSCRIPT NINE}",
];
use constant InlineTags => { map { $_ => 1 } '#PCDATA', 'font' };
use constant DeleteTags => {
map { $_ => 1 }
qw(
head style applet script
)
};
use constant IgnoreTags => {
map { $_ => 1 }
qw(
title a ul
del address blockquote colgroup fieldset
input form frameset object noframes noscript
small optgroup isindex area textarea col
pre frame param menu acronym abbr bdo
label basefont big caption option cite
dd dfn dt base code map iframe ins kbd legend
samp span dir strike meta link tbody q tfoot
button thead tt select s
var
)
};
use constant TwigArguments => (
twig_handlers => {
html => sub {
$_->del_atts;
$_->set_gi('pdftemplate');
},
map((
"h$_" => (
sub {
my $size = 4 + shift;
sub {
$_->insert_new_elt(before => 'textbox')
->wrap_in('row')
->wrap_in(font => { face => $FontBold });
$_->wrap_in(
font => { h => $LineHeight + 6 - $size });
$_->wrap_in(
row => { h => $LineHeight + 8 - $size });
$_->set_tag('textbox'), $_->set_att(w => '100%');
};
}
)->($_)
),
1 .. 6),
center => sub {
foreach my $child ($_->children('p')) {
# XXX - revert other blocklevel to left/original alignment
$child->set_att(align => 'center');
}
$_->erase;
},
sup => sub {
my $digits = $_->text;
my $text = '';
$text .= +SuperScript->[$1] while $digits =~ s/(\d)//;
$_->set_text($text);
$_->erase;
},
sub => sub {
my $digits = $_->text;
my $text = '';
$text .= +SubScript->[$1] while $digits =~ s/(\d)//;
$_->set_text($text);
$_->erase;
},
u => sub {
_set(underline => 1, $_);
$_->erase;
},
em => sub {
_set(font => $FontOblique, $_);
$_->erase;
},
i => sub {
_set(font => $FontOblique, $_);
$_->erase;
},
strong => sub {
_set(font => $FontBold, $_);
$_->erase;
},
b => sub {
_set(font => $FontBold, $_);
$_->erase;
},
div => sub {
if (my $tag = (_type(header => $_) || _type(footer => $_))) {
$_->set_tag($tag);
$_->set_att(
"${tag}_height" => int(
sum(
$LineHeight * 2,
grep defined,
map $_->att('h'),
$_->descendants
)
),
);
}
else {
$_->erase;
}
},
hr => sub {
$_->insert_new_elt(first_child => (_type(pagebreak => $_) || 'hr'));
$_->erase;
},
img => sub {
my $src = $_->att('src');
my $file = File::Spec->rel2abs($src);
if ($src =~ m{^(\w+):/}) {
require LWP::Simple;
require File::Basename;
require File::Spec;
$file =
File::Spec->catfile(File::Spec->tmpdir,
File::Basename::basename($src));
LWP::Simple::mirror($src => $file);
}
# CSA - check for real file first
#
if (-e $file) {
my $w = $_->att('width');
my $h = $_->att('height');
if (($w eq '') or ($h eq '')) {
require Image::Size;
my ($iw, $ih) = Image::Size::imgsize($file);
# CSA - catch this now, before we crash
#
warn "unable to read image file '$file' ($w x $h)"
unless (defined $iw && defined $ih);
$iw ||= 1;
$ih ||= 1;
if (!$w and !$h) {
($w, $h) = ($iw, $ih);
}
elsif (!$w) {
$w = $iw * ($h / $ih);
}
else {
$h = $ih * ($w / $iw);
}
}
my $image = $_->insert_new_elt(
first_child => image => {
filename => $file,
w => ($w / $PageWidth * $PageResolution),
h => ($h / $PageWidth * $PageResolution),
type => '',
}
);
$image->wrap_in('row');
# CSA - File has gone missing
#
}
else {
warn "image file '$file' does not exist";
}
$_->erase;
},
body => sub {
# XXX make pagedef into parameters
if ($Landscape) {
require PDF::Template;
$PageSize = 'A4LANDSCAPE';
$PDF::Template::Constants::Verify{PAGESIZE}{__DEFAULT__} =
'A4LANDSCAPE';
$PDF::Template::Constants::Verify{PAGESIZE}{A4LANDSCAPE} = {
PAGE_WIDTH => 842,
PAGE_HEIGHT => 595,
};
}
$_->wrap_in(
pagedef => {
pagesize => $PageSize,
landscape => $Landscape,
margins => $LineHeight - 2,
},
);
$_->wrap_in(
font => {
face => $Font,
h => $LineHeight - 2,
}
);
my $pagedef = $_->parent->parent;
my $head = ($pagedef->descendants('header'))[0]
|| $pagedef->insert_new_elt(
first_child => header => { header_height => $LineHeight * 2 });
my $row = $head->insert_new_elt(first_child => 'row');
$row->insert_new_elt(
first_child => textbox => { w => '100%', text => '' });
foreach my $child ($_->children('#PCDATA')) {
$child->set_text(
join(' ', grep length, split(/\n+/, $child->text)));
if ($child->text =~ /[^\x00-\x7f]/) {
$child->wrap_in(font => { face => $FontUnicode });
}
$child->wrap_in('row');
$child->wrap_in(textbox => { w => '100%' });
$child->insert_new_elt(after => 'textbox')->wrap_in('row');
}
$_->erase;
},
p => \&_p,
li => \&_p,
table => sub {
our @RowSpan = ();
$_->root->del_att('#widths');
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
ol => sub {
my $count = 1;
foreach my $child ($_->descendants('counter')) {
$child->set_tag('textbox');
$child->set_text("$count. ");
$count++;
}
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
br => sub {
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
ul => sub {
foreach my $child ($_->descendants('counter')) {
$child->set_tag('textbox');
$child->set_text("* ");
}
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
dl => sub {
foreach my $child ($_->descendants('counter')) {
$child->delete;
}
$_->insert_new_elt(last_child => row => { h => $LineHeight });
$_->erase;
},
tr => sub {
return $_->erase if $_->descendants('row');
our @RowSpan;
my @children = $_->descendants('textbox');
my @cells = @{ shift(@RowSpan) || [] };
foreach my $i (1 .. $#cells) {
my $cell = $cells[$i] or next;
my $child;
if ($child = $children[ $i - 1 ]) {
$child->insert_new_elt(before => 'textbox', $cell);
}
elsif ($child = $children[ $i - 2 ]) {
$child->insert_new_elt(after => 'textbox', $cell);
}
else {
next;
}
@children = $_->descendants('textbox');
}
my $cols = sum(map { $_->att('colspan') || 1 } @children);
# print STDERR "==> Total cols: $cols :".@children.$/;
my $widths = $_->root->att('#widths') || [];
my $width = $_->att('w')
|| ($widths->[ $_->pos ] ||=
(int(_percentify($_->parent('table')->att('width')) / $cols)));
my $sum = 100;
my $last_child = pop(@children);
foreach my $child (@children) {
my $w = ($width * ($child->att('colspan') || 1));
$child->set_att(w => "$w%");
$sum -= $w;
}
$last_child->set_att(w => "$sum%") if $last_child;
$_->set_tag('row');
$_->set_att(lmargin => '3');
$_->set_att(rmargin => '3');
$_->set_att(border => $_->parent('table')->att('border'));
},
td => \&_td,
th => \&_td,
font => sub {
$_->del_att('face');
if ($_->att_names) {
$_->set_att(face => $Font);
$_->erase; # XXX
}
else {
$_->erase;
}
},
var => sub {
# XXX - Proper variable support
},
_default_ => sub {
$_->erase if +IgnoreTags->{ $_->tag };
$_->delete if +DeleteTags->{ $_->tag };
}
},
pretty_print => 'indented',
empty_tags => 'html',
start_tag_handlers => {
_all_ => sub {
if (my $w = $_->att('width') and 0) {
$_->set_att(w => $w);
my $widths = $_->root->att('#widths') || [];
$widths->[ $_->pos ] = $w;
$_->root->set_att('#widths' => $widths);
}
if (my $h = $_->att('size')) {
$_->set_att(h => $LineHeight + (2 * ($h - 4)));
}
if (my $bgcolor = $_->att('bgcolor')) {
$_->set_att(bgcolor => _to_color($bgcolor));
}
$_->del_att(
qw(
color bordercolor bordercolordark bordercolorlight
cellpadding cellspacing size href
)
);
},
}
);
sub _set {
my ($key, $value, $elt) = @_;
my $att = $elt->root->att("#$key") || {};
$att->{ $elt->parent } = $value;
$elt->root->set_att("#$key", $att);
}
sub _get {
my ($key, $elt) = @_;
my $att = $elt->root->att("#$key") || {};
return $att->{$elt};
}
sub _p {
my @children;
foreach my $child ($_->children) {
+InlineTags->{ $child->tag } or last;
push @children, $child->cut;
}
if (@children) {
my $textbox = $_->insert_new_elt(
before => textbox => {
w => (($_->tag eq 'p') ? '100%' : '97%'),
align => $_->att('align')
},
);
$textbox->wrap_in('row');
if ($_->tag eq 'li') {
$textbox->insert_new_elt(
before => counter => { w => '3%', align => 'right' });
}
foreach my $child (@children) {
$child->paste(last_child => $textbox);
$child->set_text(
join(' ',
grep { length and $_ ne 1 } split(/\n+/, $child->text))
);
}
my $font = _get(font => $_);
if ($textbox->text =~ /[^\x00-\x7f]/) {
$font = $FontUnicode;
}
elsif ($_->parent('i') and $_->parent('b')) {
$font ||= $FontBoldOblique;
}
elsif ($_->parent('i')) {
$font ||= $FontOblique;
}
elsif ($_->parent('b')) {
$font ||= $FontBold;
}
my %attr;
$attr{face} = $font if $font;
if (_get(underline => $_)) {
my $align = $textbox->att('align');
$align .= '_underline';
$textbox->del_att('align');
require PDF::Template::Constants;
$PDF::Template::Constants::Verify{ALIGN}{$align} = 1
if %PDF::Template::Constants::Verify;
$attr{align} = $align;
}
$textbox->wrap_in('font' => \%attr) if %attr;
}
$_->insert_new_elt(first_child => 'textbox')->wrap_in('row')
if $_->tag eq 'p';
$_->erase;
}
sub _td {
return $_->erase if $_->descendants('row');
$_->set_tag('textbox');
if (my $font = _get(font => $_)) {
$_->wrap_in(font => { face => $font });
}
my $cols = $_->parent->att('_cols');
$cols += ($_->att('colspan') || 1);
$_->parent->set_att(_cols => $cols);
if (my $rowspan = $_->att('rowspan')) {
# ok, we can't really do this.
# what we can do, though, is to add 'fake' cells in the next row.
our @RowSpan;
foreach my $i (1 .. ($rowspan - 1)) {
$RowSpan[$i][$cols] = $_->atts;
}
}
}
sub _percentify {
my $num = shift || '100%';
return $1 if $num =~ /(\d+)%/;
return int($num / $PageWidth * 100);
}
sub _type {
my ($val, $elt) = @_;
return first { $_ eq $val } grep defined, map $elt->att($_), qw(type class);
}
sub _to_color {
my ($color) = @_;
if ($color !~ s/^#//) {
$color = Graphics::ColorNames->new('Netscape')->hex($color);
}
return join ',', hex2tuple($color);
}
1;
=head1 AUTHORS
Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
=head1 COPYRIGHT
Copyright 2004, 2005 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
See L<http://www.perl.com/perl/misc/Artistic.html>
=cut