The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#! /usr/bin/perl
use LWP::UserAgent;
require HTML::Parse;
require HTML::FormatText;
use HTML::Entities;
use Getopt::Long;
use File::Path;
use HTTP::Date qw(time2str str2time);
#use LWP::Debug qw(+);

=head1 NAME

webmirror - Simple WWW mirror program

=head1 SYNOPSIS

  webmirror [--from URL] [--to directory] [--proxy] [--checklinks]
	    [--verbose]

=head1 DESCRIPTION

This program can be used to mirror parts of WWW servers to a local
directory.  The traversal of the remote pages is recursive, that is,
linked pages are mirrored, too, if they start with the same prefix.  For
example, if you mirror C<http://www.math.fu-berlin.de/~leitner/perl/>,
then C<http://www.math.fu-berlin.de/~leitner/perl/webmirror> would be
mirrored, too, but C<http://www.math.fu-berlin.de/~leitner/> would not.
These are the meanings of the options:

=over 4

=item --from <URL>

Where to mirror from.  This has to be an HTTP URL.

=item --to <directory>

Where to put the mirrored files.

=item --proxy

Use the WWW proxy settings from the environment (http_proxy, no_proxy).
You should not use this if your proxy is a caching proxy, only if it is
a firewall proxy.  Otherwise you may mirror old versions of the pages.

=item --checklinks

Check not only links that have the same prefix but one level of other
links, too.  This is very useful for checking your own pages for invalid
outbound links.

=item --verbose

Print debugging information.

=back

=head1 DIAGNOSTICS

webmirror normally prints only errors, for example:

  Getting http://www.math.fu-berlin.de/~leitner/mutt/idnex.html
  Error: 404 not found
  Linked from: http://www.math.fu-berlin.de/~leitner/index.html

=head1 SEE ALSO

L<lwp-mirror>, L<LWP>, L<lwp-request>, L<mirror>

=head1 COPYRIGHT

webmirror is Copyright (c) 1996 Felix von Leitner. All rights reserved.
libwww-perl is Copyright (c) 1995, 1996 Gisle Aas. All rights reserved.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut

&GetOptions(
	"from=s" => \$from,
	"to=s" => \$to,
	"proxy!" => \$proxy,
	"checklinks!" => \$checklinks,
	"verbose!" => \$verbose);
if ($#ARGV==1) {
  $from = shift(@ARGV);
  $to = shift(@ARGV);
}
if ($#ARGV==0) {
  if (length($from)) {
    $to = shift(@ARGV);
  } else {
    $from = shift(@ARGV);
  }
}
if (not defined $from) {
  print "Syntax: $0 [--from URL] [--to DIRECTORY] [--proxy] [--verbose] [--checklinks]\n";
  exit 0;
}
$to =~ s/^\~/$ENV{HOME}/;
#$baseurl=$from;
$ua = new LWP::UserAgent;
$ua->agent("Fefe-Mini-Mirror/0.1 " . $ua->agent);
$ua->env_proxy if ($proxy);
push @links,$from;
push @linkedfrom,"command line";
$num=0;
while ($url = shift @links) {
  $linkedfrom = shift @linkedfrom;
  next if (exists $history{$url});
  $history{$url}=1;
  print STDERR "Getting $url\n" if ($verbose);
  $req = new HTTP::Request 'GET' => $url;
  if (not defined $baseurl) {
    $baseurl = $url;
    $baseurl =~ s,[^/]+$,, if ($baseurl =~ m,[^/]$,);
    print "Setting baseurl to $baseurl\n" if ($verbose);
  }
  if ($url =~ m,^$baseurl,) {
    my $rel=$url; $rel=~s/^$baseurl//;
    $rel .= "/index.html" if ($rel=~m,/$, or $rel eq "");
# print STDERR "  relative URL is $rel\n";
    $dest = "$to/$rel"; $dest =~ s,//,/,g;
# print STDERR "  destination is $dest\n";
    if (($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime,$blksize,$blocks) = stat($dest)) {
      $req->header('If-Modified-Since', time2str($mtime));
      print "  If-Modified-Since: ",time2str($mtime),"\n" if ($verbose);
    }
  }
  $res = $ua->request($req);  $success=$res->is_success;
  if (not $url =~ m,^$baseurl,) {
    if ($res->content =~ m/\<(moved|please update)\>/) {
      print STDERR "Getting $url\n" if (not $verbose);
      print STDERR "  Error: page moved!\n";
      print STDERR "  Linked from: $linkedfrom\n";
      next;
    }
    next if ($success);
  }
  if ($res->code == 500 or $res->code == 304) {	# Kludge!
    $success=1; $needtosave=0;
    open FILE,$dest || die;
    read FILE,$content,10*1024*1024;
    close FILE;
    $base="$baseurl$rel";
    #print "  Base: $base\n";
    $mimetype="unknown";
    $mimetype="text/html" if ($dest =~ m/\.html?$/);
#    print STDERR "Content-Type: $mimetype\n";
  } else {
    $content=$res->content;
    $needtosave=1;
    $mimetype=$res->header('Content-type');
#    print STDERR "Content-Type: $mimetype\n";
    $time=$res->header('Last-Modified');
    if (defined $time) {
      $time=str2time($time);
#     print "  Last-Modified: $time\n" if ($verbose);
    }
  }
  if ($success) {
    my $rel=$url; $rel=~s/^$baseurl//;
    $rel .= "/index.html" if ($rel=~m,/$, or $rel eq "");
    $dest = "$to/$rel"; $dest =~ s,//,/,g;
    #print STDERR "  relative URL is $rel\n";
    { my $localdest=$dest;
      $localdest =~ s,^$ENV{HOME},~,;
      print STDERR "  Saving to $localdest\n" if ($verbose and $needtosave);
    }
    {
      my $path=$dest; $path =~ s,/[^/]+$,,;
      mkpath($path);
    }
    if ($needtosave) {
      {
        my @tags = split(/</,$content);
	foreach $i (@tags) {
	  $i =~ s,$baseurl/*,/,;
	  $i =~ s,="",="index.html",;
	  $i =~ s,=(\s|>),=index.html$1,;
	}
	$content = join('<',@tags);
      }
      open FILE,">$dest" || die;
      print FILE $content;
      close FILE;
      utime $time,$time,$dest;
      $base = $res->base;
    } else {
      print "  Not modified, not saving\n" if ($verbose);
    }
    my $html = HTML::Parse::parse_html($content);
#    print "  Base: $base\n";
    print STDERR "  Content-Type: $mimetype\n" if ($verbose);
    if ($mimetype =~ m,^text/html$,) {
      for ( @{ $html->extract_links } ) {
	my($link, $elem) = @$_;
	my $tag = uc $elem->tag;
#	print "  Link: $link -> \n";
	$link = new URI::URL $link, $res->base;
#	print $link->abs->as_string,"\n";
#	print STDERR "  found ",$link->abs->as_string,"\n";
	my $Link = $link->abs->as_string;
	$Link =~ s/#.*$//;
	if ($Link =~ m/$baseurl/ or $checklinks) {
	  next if ($Link =~ m/^(mailto|news):/i);
	  push @links,$Link;
	  push @linkedfrom,$url;
	}
      }
    }
#  print content;
  } else {
    print STDERR "Getting $url\n" if (not $verbose);
    print STDERR "  Error: " . $res->code . " " . $res->message,"\n";
    print STDERR "  Linked from: $linkedfrom\n";
  }
}