The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/home1/mikedlr/perl

=head1 NAME

extract-changed - extract links from files which have been changed

=head1 SYNOPSIS

extract-changed changelist

=head1 DESCRIPTION

For each file in a hierarchy, see if it is newer than the cdb file
which has the links.  If it is then extract from it and store that in
the new list (output in cdbmmake format) if it isn't just dump the old
values from the database.

=cut

use Getopt::Function; #don't need yet qw(maketrue makevalue);

$::opthandler = new Getopt::Function 
  [ "version V>version",
    "usage h>usage help>usage",
    "help-opt=s",
    "verbose:i v>verbose",
  ],  {};

$::opthandler->std_opts;

$::opthandler->check_opts;

@::changelist = @ARGV;

sub usage() {
  print <<EOF;
extract-changed [options] page-url...

EOF
  $::opthandler->list_opts;
}

sub version() {
  print <<'EOF';
extract-changed version 
$Id: extract-changed.pl.idea,v 1.4 2002/01/02 22:53:14 mikedlr Exp $
EOF
}



sort { $::a cmp $::b } @::changelist;

# we have the page index (which links on what page) open both for
# reading and writing effectively..

use CDB_File 0.86;
use CDB_File::BiIndex::Generator;

#1 tie in to the old page index

tie %::old_pages, "CDB_File", $::page_index;
my ($page,$link);

my $next_change = shift @::changelist;
OLDPAIR: while (($page,$link) = each %::old_pages ) {
  my $comp = ($next_chage cmp $page);
  $comp < 0 && do {
    $::new_pages->add($page,$link);
    $::new_links->add($link,$page);
  };
  $comp = 0 && do {
    my $ignorekey=$key;
    #chuck old values
    ($page,$link) = each %::old_pages while $key eq $ignorekey;
    #extract new values
    #ready for next change
    my $next_change = shift @::changelist;
    #start the nextpair
    redo OLDPAIR; 
  };
}


sub extract_file {
  $_=shift;
  die "$_ is a directory" if -d;

# FIXME dangerous, there could be a link to an external file which
# means the external file should be included in the infostructure
  return if -l; 

  return unless guess_media_type($_) =~ /html/;

  my $page_name="$_";
  my @linklist=();

  #1 what's the base for this file
  my $base = $File::Find::dir . '/';
  $base =~ s/^$::file_base/$::url_base/;
  my $page_url="$base$_";
  
  
  print STDERR "acting on $page_name, url $page_url\n" 
      if $::verbose & 4;

  my $act_url= sub {
    my($tag, %attr) = @_;

    #this is a closure which uses 
    #$page_name
    #$page_record

    my $linkname;
    my $attr;

    #my $linkname=$attr{'href'};
  LINK: while (($attr, $linkname)=each %attr) {
      unless ( $linkname ) {
	warn "Undefined link name generated";
	next LINK;
      }
      print STDERR "extracting link $linkname\n" if 
	  $::verbose & 8;

      #we should make sure the link name is as canonical as possible.. 
      my $url=new URI $linkname;
      $url=$url->abs($base);
      $linkname = $url->as_string();
      print STDERR "fixed up link name $linkname\n" 
	  if $::verbose & 16;

      #MAKE SURE THERE IS A LINK RECORD

      print URLLIST $linkname . "\n";
 
      push @linklist, $linkname;
    }
  }; # definition of $act_link function

  my $p = HTML::LinkExtor->new($act_url);
  unless ( eval { $p->parse_file($_) } ) {
    warn $@ ;
    return;
  }
  
  die "no url??" unless $page_url;
  $::index->add_list_first($page_url,\@linklist);
}