The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -s

$VERSION = '0.04';

if($h || $help || $v || $version || !$start) {
 require File::Slurp;
 File::Slurp::read_file($0) =~ /^=head1 SYNOPSIS(.*?)=head1 DESCRIPTION/ms;
 print(
qq"
This is anarch, version $VERSION. Type 'man anarch' or 'perldoc anarch'
for the manual.

Usage:$1");
 exit ;
}

use CSS::DOM 0.03;
use CSS::DOM::Parser;
use CSS::DOM::Rule 0.03 ':all';
use File::Basename;
use File::Path;
use HTML::DOM 0.025;
use LWP::UserAgent 5.815;
use URI;
use URI::file;
use WWW::Scripter;

$m = new WWW::Scripter agent => $agent = "anarch/$VERSION";
$lwp = new LWP::UserAgent agent => $agent;
env_proxy $lwp;


$root ||= ($start =~ /^(.*\/)/)[0];
$save_as = ${'save-as'};
length $save_as or $save_as = ($root =~ /([^\/]+)\/*\z/)[0];
$exclude = defined $exclude ?  qr/$exclude/ : qr/(?!)/;
$add_ext = ${'add-extensions'};

if(${'run-scripts'}) {
 $m->use_plugin(Ajax, init => sub {
  $need_doc_clone = 1;
 });
 $dom ||= ${'remove-scripts'};
 $m->add_handler(
  request_send => sub {
   push @link_accumulator, shift->uri;
   return # We must explicitly return nothing here, as LWP will think any
  }       # true value is a response object.
 );
}
else {
 $dom = 1
}

if($sync) {
 $m->add_handler(
  request_prepare => sub {
   if(defined $local_url) {
    my $req = shift;
    $real_url = $req->uri($local_url);
    delete $$req{proxy};
    $local_url = undef
   }
   return;
  }
 );
 $m->add_handler(
  response_done => sub {
   if(defined $real_url) {
    shift->request->uri($real_url);
    $real_url = undef
   }
   return;
  }
 );
}


my %link_attr = qw(
    a      href
    area   href
    body   background
    frame  src
    iframe src
    link   href
    img    src
    table  background
    tr     background
    td     background
    th     background
);

$link_attr{script} = 'src' unless ${'remove-scripts'};

my $link_tag_qr = qr/^(?:${\join '|', keys %link_attr})\z/;

fetch($start);

sub fetch {
	my ($url,$ct,$ext) = @_;
	return if $url !~ /^\Q$root\E(.*)/;
	my $save_name = $1;
	return if $visited{$url}++;

	$save_name =~ s/%([a-fA-F0-9]{2})/chr hex $1/ge;
	substr($save_name,0,0) = $save_as. '/';
	$save_name =~ s/(?<=\/)\z/index.html/;
	$save_name .= ".$ext" if $ext;

	my $syncked;
	my $caput;
	if($sync) {{
#		warn "checking $save_name for mods";
		my $mod_time = (stat $save_name)[9];
		defined $mod_time or last;
		$caput ||= $lwp->head($url);
		if( # not modified
		 $caput->is_success and $caput->last_modified == $mod_time
		) {
			# No point in checking for links in sound files
			return unless (
			 defined $ct ? $ct : ($ct = $caput->content_type)
			)  =~ '^text/';
			++$syncked;
		}
	}}
	if($syncked) {
#		warn "checking $save_name for links";
		$local_url = new_abs URI::file $save_name;
		$m->location->replace($url);
		$syncked = 0 unless $m->success;
	}
	if(!$syncked) {
		print #"\t" x $level .
		 "$url\n";

		# Check the file type before downloading it into memory. It
		# might be a huge binary file.
		$caput = $lwp->head($url);
		if($caput->content_type =~ '^text/') {
			$need_doc_clone = undef;
			$m->location->replace($url);
		}
		else { # for binary files, download directly to a file
		       # and use a handler to abort WWW::Scripter’s post-
		       # processing.
			$m->set_my_handler(
			 response_done => sub { WWW::Scripter::abort }
			);
			mkpath dirname $save_name;
			$m->get($url, ':content_file', $save_name);
			$m->set_my_handler( response_done => undef );
			return;
		}
	}
	return unless $m->success;
	
	my $is_html;
	my $content;
	my @links; @link_accumulator = ();
	if(defined $ct ? $ct eq 'text/html' : $m->is_html) {
		++$is_html; # ~~~ temporary charset hack
		my $other_doc;
		my $doc = $dom || !$need_doc_clone
		  ? $m->document
		  : do {
		     # reparse the source without script handlers
		     my $doc = $m->document;
		     my $cs = $doc->charset;
		     my $clone = new HTML::DOM
		      response => my $res = $m->res,
		      charset => $cs,
		     ;
		     $clone->write(
		      defined $cs
		       ? scalar(
		          require Encode,
		          Encode::decode $cs, $res->content
		         )
		       : $res->content
		     );
		     $m->document($clone);
		     $other_doc = $doc;
		     $clone;
		    };
		if (${'remove-scripts'}) {
			for ($doc->getElementsByTagName('*')) {
				$_->detach, $_->delete, next if tag $_ eq 'script';
				my $a = $_->attributes;
				for (map $a->item($_), 0..$a->length-1) {
					my $name = $_->name;
					$a->removeNamedItem($name) if
						$name =~ /^on/i;
				}
			}
		}
		# ~~~ temporary; doesn't work in the general case:
		my $enc_elem = $doc->createElement('meta');
		$enc_elem->attr('http-equiv', 'Content-Type');
		$enc_elem->attr('content', 'text/html; charset=utf-8');
		($doc->getElementsByTagName('head'))[0]->splice_content(0,0,
			$enc_elem);
		my $base = $doc->base;
		{
			my $base = $doc->find('base') or next;
			$base->detach;
			$base->delete
		}
		for (
		 map $_->getElementsByTagName('*'), grep $_,$doc,$other_doc
		) {
			my $tag = tag $_;
			if($tag =~ $link_tag_qr) {
				my $link = _deal_with_link(
					$_->attr($link_attr{$tag}),
					$tag =~ /^(?:body|img|td|script)\z/
						? 'pass-through' :
					$tag eq 'link' && eval{$_->sheet}
						? 'text/css' : undef,
					$url,$base,\@links
				);
				$_->attr($link_attr{$tag}=>$link);
			}
			if($tag eq 'style') {
				my $s = sheet $_;
				my $css_code = _deal_with_ss(
					$s,$url,$base,\@links
				);
				$css_code and $_->firstChild->data(
					"<!--\n$css_code-->"
				);
			}
			if(defined(my $style = $_->attr('style'))) {
				my $css_code = _deal_with_css_urls(
					$style,$url,$base,\@links
				);
				$css_code and $_->attr(style =>
					$css_code);
			}
		}
		$content = $m->content;
	}
	elsif(defined $ct ? $ct eq 'text/css' : $m->ct eq 'text/css') {
		# ~~~ What do we do about charsets?
		my $c = CSS'DOM'parse $m->res->content;
		my $css_code = _deal_with_ss(
			$c,$url,$url,\@links
		);
		$content = $css_code || $m->res->content;
	}
	else {
		$content = $m->res->content;
	}

	if(!$syncked) {
		mkpath dirname $save_name;
		my $fh;
		$is_html ? (open $fh, ">:utf8", $save_name) :
		           (open $fh, ">", $save_name) or warn <<, return;
anarch: I couldn't save $save_name for the following most unfortu-
        nate reason: $!

		print $fh $content;
		close $fh;
		if(my $mod_time = $m->res->last_modified) {
			utime $mod_time, $mod_time, $save_name;
		}
	}
	
	local $level = $level + 1;
	return if defined $depth and $level > $depth;
	push @links, @link_accumulator;
	for (@links) {
		fetch(@$_);
		$m->back;
	}
}

sub _deal_with_link{
	my($url,$type,$page_url,$base,$links) = @_;
	return unless defined $url;
	my $url = new_abs URI $url, $base;
	$url =~ /^\Q$root/ and $url !~ $exclude or return $url;
	(my $wo_hash = $url)->fragment(undef);
	push @$links, [$wo_hash,$type];
	$url =~ s/\?/%3f/;
	$url = (new URI $url)->rel($page_url);
	if($add_ext && $type ne 'pass-through') {
		my $type = $type;
		if(not $type) {
			my $caput = $lwp->head($wo_hash);
			$type = $PageTypes{$wo_hash}
			 = $caput->content_type;
		}
		use LWP::MediaTypes;
		$SuffixRe{$type} ||= do {
			my $s = join "|", media_suffix $type;
			qr/\.(?:$s)\z/
		};
		if($wo_hash !~ $SuffixRe{$type}) {
			my $suf = media_suffix $type;
			push @{$$links[-1]}, $suf;
			$url->path($url->path . ".$suf");
		}
	}
	$url =~ s/(?<=\/)\z/index.html/ # ~~~ this won’t work for foo.html#/
	 and @{$$links[-1]} = @{$$links[-1]}[0,1]; # remove the $ext if
	return $url        # it’s there, as we don’t want it for index.html
}

sub _deal_with_ss {
	my ($ss, $page_url, $base, $links) = @_;
	my $made_changes;
	for($ss->cssRules) {
		next unless $_->type == IMPORT_RULE;
		# ~~~ only works with URLs that do not contain quotes
		$_->cssText('@import "' . _deal_with_link(
			$_->href,'text/css',$page_url,$base, $links
		) . '" ' . $_->media->mediaText);
		++$made_changes;
	}
	my $css_code = join '', map $_->cssText, cssRules $ss;

	$css_code2 = _deal_with_css_urls($css_code,$page_url,$base,$links);

	return $css_code2 || $made_changes && $css_code;
}

sub _deal_with_css_urls {
	my ($css_code, $page_url, $base, $links) = @_;

	my $made_changes;

	# I’m using the undocumented innards of CSS::DOM::Parser here.
	my($types, $tokens) = CSS'DOM'Parser'tokenise $css_code;
	while($types =~ /u/g) {
		++$made_changes;
		my $url = $$tokens[my $indx = $-[0]];
		$url =~ s/^url\([ \t\r\n\f]*//;
		$url =~ s/[ \t\r\n\f]*\)\z//;
		$url =~ s/^['"]// and chop $url;
# ~~~ doesn't take escapes into account!
		$$tokens[$indx] = "url(" . _deal_with_link(
			CSS'DOM'Parser'unescape($url)
			,undef,$page_url,$base, $links
		) . ")";
	}

	return $made_changes && join '', @$tokens;	
}

=head1 NAME

anarch - A script for creating offline copies of websites

=head1 VERSION

0.03 (alpha)

=head1 SYNOPSIS

 anarch -start=http://www.example.com/some_page.html           \
      [ -root=http://www.example.com ]                         \
      [ -exclude='^http://www\.example\.com/dont-want-this/' ] \
      [ -save-as=folder\ name ]                                \
      [ -depth=5 ]                                             \
      [ -run-scripts ]                                         \
      [ -remove-scripts ]                                      \
      [ -dom ]                                                 \
      [ -sync ]                                                \
      [ -add-extensions ]

=head1 DESCRIPTION

anarch is a script for creating offline copies of websites. It downloads a
website, correcting links in pages and style sheets so that they are all 
relative (and all links outside the root directory are absolute), and
removing '<base href>' tags. It can also run scripts in pages (to find out
which files the scripts use or to save pages with generated content) and
remove scripts.

=head1 OPTIONS

=over

=item -start=...

The page to start on

=item -root=...

Only get URLs beginning with this. If this is omitted, C<-start> is used,
trimmed to the last slash.

=item -exclude=...

Regular expression for URLs to be excluded

=item -save-as=...

Where to save it. If this is omitted, the last path segment of C<-root> is
used.

=item -depth=...

How many links to follow one after the other before going back

=item -run-scripts

Run scripts in HTML pages. This will be used to find which files the 
scripts
need, so that those can be fetched as well. It is not always guaranteed that this will work, as some scripts have
absolute URLs hard-coded.

=item -remove-scripts

Remove scripts from HTML pages. This can be used in conjunction with 
C<-run-scripts> to save generated content while removing the 
scripts that
generated it. C<-remove-scripts> implies 
C<-dom>.

=item -dom

Save the HTML DOM, possibly modified by scripts or C<-remove-scripts>.
Without this, the only changes to the DOM that are saved are those made
to links to make them relative.

=item -sync

Synchronise mode: Only files that have changed since the last download will 
be
downloaded if this option is given.

=item -add-extensions

This feature is highly experimental. Use it at your own risk!

This adds extensions to files based on the MIME type, if the file does not
already have an appropriate extension. (This exists for the sake of picky
web browsers that refuse to open certain files.)

It currently has four problems with it: The start page never gets an extension added; index.html is sometimes saved as index.html.html; images
and style sheets never get extensions added, unless they are linked to
directly (via C<< <a href...> >>); and extra HEAD requests are made (far more than necessary), which slow things down to a crawl.

=back

=head1 BUGS

If you find any bugs, please e-mail the author.

This program doesn't take redirection into account.

It doesn't work with pages that have an explicit encoding in the source
code.

It doesn't work with pages not in UTF-8. (It would need to
apply a charset attribute to various elements or save the page in the
original encoding.)

Style attributes containing URLs get mangled.

When URLs in CSS style sheets are made relative, they are not properly
escaped, so quotation marks may produce invalid CSS.

The C<-run-scripts> option causes the script to eat up all your memory.

If a scripts browses to another page and C<-dom> or C<-remove-scripts> is
specified, then the wrong DOM tree is serialised.

There is no way to set the local IP address to bind to. But you can use
this nutty workaround, which requires L<Hook::WrapSub>:

  perl -sS -MHook::WrapSub -MIO::Socket::INET \
    -M'less Hook::WrapSub::wrap_subs
      sub{push @_, LocalAddr => "10.10.10.205" },
      "IO::Socket::INET::new"' \
    anarch ...

There are not enough tests yet.

If you run anarch as an argument to L<perl>, and you are using L<perl>
5.8.8 or lower, you will need to call L<perl> with the C<-s> option.

=head1 SINE QUIBUS NON

This program requires L<perl> 5.8.3 or higher (5.8.4 or higher recommended)
and the following L<CPAN|http://www.cpan.org/> modules:

L<WWW::Scripter>

L<CSS::DOM> 0.03 or higher

L<URI>

L<File::Slurp>

L<LWP> 5.815 or higher

L<HTML::DOM> 0.025 or higher

L<WWW::Scripter::Plugin::Ajax> is required for the C<-run-scripts> option
to work.

=head1 AUTHOR & COPYLEFT

Copyright (C) 2009, Father Chrysostomos (sprout at, um, cpan dot org)

This program is free software; you may redistribute or modify it (or both)
under the same terms as perl.