package HTML::Copy;
use 5.008;
use strict;
use warnings;
use File::Spec;
use File::Basename;
use File::Path;
#use Cwd;
use utf8;
use Encode;
use Encode::Guess;
use Carp;
#use Data::Dumper;
use HTML::Parser 3.40;
use HTML::HeadParser;
use URI::file;
use base qw(HTML::Parser Class::Accessor);
__PACKAGE__->mk_accessors(qw(link_attributes
has_base));
#use Data::Dumper;
our @default_link_attributes = ('src', 'href', 'background', 'csref', 'livesrc');
# 'livesrc' and 'csref' are uesed in Adobe GoLive
=head1 NAME
HTML::Copy - copy a HTML file without breaking links.
=head1 VERSION
Version 1.3
=cut
our $VERSION = '1.3';
=head1 SYMPOSIS
use HTML::Copy;
HTML::Copy->htmlcopy($source_path, $destination_path);
# or
$p = HTML::Copy->new($source_path);
$p->copy_to($destination_path);
# or
open my $in, "<", $source_path;
$p = HTML::Copy->new($in)
$p->source_path($source_path); # can be omitted,
# when $source_path is in cwd.
$p->destination_path($destination_path) # can be omitted,
# when $source_path is in cwd.
open my $out, ">", $source_path;
$p->copy_to($out);
=head1 DESCRIPTION
This module is to copy a HTML file without beaking links in the file. This module is a sub class of HTML::Parser.
=head1 REQUIRED MODULES
=over 2
=item L<HTML::Parser>
=back
=head1 CLASS METHODS
=head2 htmlcopy
HTML::Copy->htmlcopy($source_path, $destination_path);
Parse contents of $source_path, change links and write into $destination_path.
=cut
sub htmlcopy($$$) {
my ($class, $source_path, $destination_path) = @_;
my $p = $class->new($source_path);
return $p->copy_to($destination_path);
}
=head2 parse_file
$html_text = HTML::Copy->parse_file($source_path,
$destination_path);
Parse contents of $source_path and change links to copy into $destination_path. But don't make $destination_path. Just return modified HTML. The encoding of strings is converted into utf8.
=cut
sub parse_file($$$) {
my ($class, $source, $destination) = @_;
my $p = $class->new($source);
return $p->parse_to($destination);
}
=head1 CONSTRUCTOR METHODS
=head2 new
$p = HTML::Copy->new($source);
Make an instance of this module with specifing a source of HTML.
The argument $source can be a file path or a file handle. When a file handle is passed, you may need to indicate a file path of the passed file handle by the method L<"source_path">. If calling L<"source_path"> is omitted, it is assumed that the location of the file handle is the current working directory.
=cut
sub new {
my $class = shift @_;
my $self = $class->SUPER::new();
if (@_ > 1) {
my %args = @_;
my @keys = keys %args;
@$self{@keys} = @args{@keys};
} else {
my $file = shift @_;
if (!ref($file) && (ref(\$file) ne "GLOB")) {
$self->source_path($file);
} else {
$self->source_handle($file);
}
}
$self->link_attributes(\@default_link_attributes);
$self->has_base(0);
return $self;
}
=head1 INSTANCE METHODS
=head2 copy_to
$p->copy_to($destination)
Parse contents of $source given in new method, change links and write into $destination.
The argument $destination can be a file path or a file handle. When $destination is a file handle, you may need to indicate the location of the file handle by a method L<"destination_path">. L<"destination_path"> must be called before calling L<"copy_to">. When calling L<"destination_path"> is omitted, it is assumed that the locaiton of the file handle is the current working directory.
=cut
sub copy_to {
my ($self, $destination) = @_;
my $io_layer = $self->io_layer();
my $fh;
if (!ref($destination) && (ref(\$destination) ne "GLOB")) {
$destination = $self->set_destination($destination);
open $fh, ">$io_layer", $destination
or croak "can't open $destination.";
} else {
$fh = $destination;
binmode($fh, $io_layer);
}
$self->{'output_handle'} = $fh;
$self->SUPER::parse($self->{'source_html'});
$self->eof;
close $fh;
$self->source_handle(undef);
return $self->destination_path;
}
=head2 parse_to
$p->parse_to($destination_path)
Parse contents of $source_path given in new method, change links and return HTML contents to wirte $destination_path. Unlike copy_to, $destination_path will not created and just return modified HTML. The encoding of strings is converted into utf8.
=cut
sub parse_to {
my ($self, $destination_path) = @_;
$destination_path = $self->destination_path($destination_path);
my $output = '';
open my $fh, ">", \$output;
$self->copy_to($fh);
return Encode::decode($self->encoding, $output);
}
=head1 ACCESSOR METHODS
=head2 source_path
$p->source_path
$p->source_path($path)
Get and set a source location. Usually source location is specified with the L<"new"> method. When a file handle is passed to L<"new"> and the location of the file handle is not the current working directory, you need to use this method.
=cut
sub source_path {
my $self = shift @_;
if (@_) {
my $path = shift @_;
$self->{'source_path'} = $path;
$self->source_uri(URI::file->new_abs($path));
}
return $self->{'source_path'};
}
=head2 destination_path
$p->destination_path
$p->destination_path($path)
Get and set a destination location. Usually destination location is specified with the L<"copy_to">. When a file handle is passed to L<"copy_to"> and the location of the file handle is not the current working directory, you need to use this method before L<"copy_to">.
=cut
sub destination_path {
my $self = shift @_;
if (@_) {
my $path = shift @_;
$self->{'destination_path'} = $path;
$self->destination_uri(URI::file->new_abs($path));
}
return $self->{'destination_path'};
}
=head2 enchoding
$p->encoding;
Get an encoding of a source HTML.
=cut
sub encoding {
my ($self) = @_;
if ($self->{'encoding'}) {
return $self->{'encoding'};
}
my $in = $self->source_handle;
my $data = do {local $/; <$in>;};
my $p = HTML::HeadParser->new;
$p->utf8_mode(1);
$p->parse($data);
my $content_type = $p->header('content-type');
my $encoding = '';
if ($content_type) {
if ($content_type =~ /charset\s*=(.+)/) {
$encoding = $1;
}
}
unless ($encoding) {
my $decoder;
if (my @suspects = $self->encode_suspects) {
$decoder = Encode::Guess->guess($data, @suspects);
}
else {
$decoder = Encode::Guess->guess($data);
}
ref($decoder) or
die("Can't guess encoding of ".$self->source_path);
$encoding = $decoder->name;
}
$self->{'source_html'} = Encode::decode($encoding, $data);
$self->{'encoding'} = $encoding;
return $encoding;
}
=head2 io_layer
$p->io_layer;
$p->io_layer(':utf8');
Get and set PerlIO layer to read the source path and to write the destination path. Usualy it was automatically determined by $source_path's charset tag. If charset is not specified, Encode::Guess module will be used.
=cut
sub io_layer {
my $self = shift @_;
if (@_) {
$self->{'io_layer'} = shift @_;
}
else {
unless ($self->{'io_layer'}) {
$self->{'io_layer'} = $self->check_io_layer();
}
}
return $self->{'io_layer'};
}
=head2 encode_suspects
@suspects = $p->encode_sustects;
$p->encode_suspects(qw/shiftjis euc-jp/);
Add suspects of text encoding to guess the text encoding of the source HTML. If the source HTML have charset tag, it is not requred to add suspects.
=cut
sub encode_suspects {
my $self = shift @_;
if (@_) {
my @suspects = @_;
$self->{'EncodeSuspects'} = \@suspects;
}
if (my $suspects_ref = $self->{'EncodeSuspects'}) {
return @$suspects_ref;
}
else {
return ();
}
}
=head2 source_html
$p->source_html;
Obtain source HTML's contents
=cut
sub source_html {
my ($self) = @_;
$self->io_layer;
return $self->{'source_html'};
}
=head1 NOTE
Cleanuped pathes should be given to HTML::Copy and it's instances. For example, a verbose path like '/aa/bb/../cc' may cause converting links wrongly. This is a limitaion of the URI module's rel method. To cleanup pathes, Cwd::realpath is useful.
=head1 AUTHOR
Tetsuro KURITA <tkurita@mac.com>
=cut
##== overriding methods of HTML::Parser
sub declaration { $_[0]->output("<!$_[1]>") }
sub process { $_[0]->output($_[2]) }
sub comment { $_[0]->output("<!--$_[1]-->") }
sub end { $_[0]->output($_[2]) }
sub text { $_[0]->output($_[1]) }
sub start {
my ($self, $tag, $attr_dict, $attr_names, $tag_text) = @_;
unless ($self->has_base) {
if ($tag eq 'base') {
$self->has_base(1);
}
my $is_changed = 0;
foreach my $an_attr (@{$self->link_attributes}) {
if (exists($attr_dict->{$an_attr})){
my $link_path = $attr_dict->{$an_attr};
next if ($link_path =~ /^\$/);
my $uri = URI->new($link_path);
next if ($uri->scheme);
$is_changed = 1;
$attr_dict->{$an_attr} = $self->change_link($uri);
}
}
if ($is_changed) {
my $attrs_text = $self->build_attributes($attr_dict, $attr_names);
$tag_text = "<$tag $attrs_text>";
}
}
$self->output($tag_text);
}
##== private functions
sub complete_destination_path {
my ($self, $dir) = @_;
my $source_path = $self->source_path
or croak "Can't resolve a file name of the destination, because a source path is not given.";
my $filename = basename($source_path)
or croak "Can't resolve a file name of the destination, because given source path is a directory.";
return File::Spec->catfile($dir, $filename);
}
sub set_destination {
my ($self, $destination_path) = @_;
if (-d $destination_path) {
$destination_path = $self->complete_destination_path($destination_path);
} else {
my ($name, $dir) = fileparse($destination_path);
unless ($name) {
$destination_path = $self->complete_destination_path($destination_path);
}
mkpath($dir);
}
return $self->destination_path($destination_path);
}
sub check_io_layer {
my ($self) = @_;
my $encoding = $self->encoding;
return '' unless ($encoding);
my $io_layer = '';
if (grep {/$encoding/} ('utf8', 'utf-8', 'UTF-8') ) {
$io_layer = ":utf8";
}
else {
$io_layer = ":encoding($encoding)";
}
return $io_layer;
}
sub build_attributes {
my ($self, $attr_dict, $attr_names) = @_;
my @attrs = ();
foreach my $attr_name (@{$attr_names}) {
if ($attr_name eq '/') {
push @attrs, '/';
} else {
my $attr_value = $attr_dict->{$attr_name};
push @attrs, "$attr_name=\"$attr_value\"";
}
}
return join(' ', @attrs);
}
sub change_link {
my ($self, $uri) = @_;
my $result_uri;
my $abs_uri = $uri->abs( $self->source_uri );
my $abs_path = $abs_uri->file;
if (-e $abs_path) {
$result_uri = $abs_uri->rel($self->destination_uri);
} else {
warn("$abs_path is not found.\nThe link to this path is not changed.\n");
$result_uri = $uri;
}
return $result_uri->as_string;
}
sub output {
my ($self, $out_text) = @_;
print {$self->{'output_handle'}} $out_text;
}
sub source_handle {
my $self = shift @_;
if (@_) {
$self->{'source_handle'} = shift @_;
}
elsif (!$self->{'source_handle'}) {
my $path = $self->source_path or croak "source_paht is undefined.";
open my $in, "<", $path or croak "Can't open $path.";
$self->{'source_handle'} = $in;
}
return $self->{'source_handle'}
}
sub source_uri {
my $self = shift @_;
if (@_) {
$self->{'source_uri'} = shift @_;
} elsif (!$self->{'source_uri'}) {
$self->{'source_uri'} = do {
if (my $path = $self->source_path) {
URI::file->new_abs($path);
} else {
URI::file->cwd;
}
}
}
return $self->{'source_uri'}
}
sub destination_uri {
my $self = shift @_;
if (@_) {
$self->{'destination_uri'} = shift @_;
} elsif (!$self->{'destination_uri'}) {
$self->{'destination_uri'} = do {
if (my $path = $self->destination_path) {
URI::file->new_abs($path);
} else {
URI::file->cwd;
}
}
}
return $self->{'destination_uri'};
}
1;