The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
no locale;
use Config;
# vim: set sw=4 ts=4 si et:
use File::Basename qw(basename dirname);
chdir(dirname($0));
($file = basename($0)) =~ s/\.PL$//;
$file =~ s/\.pl$//
        if ($Config{'osname'} eq 'VMS' or
            $Config{'osname'} eq 'OS2');  # "case-forgiving"
open OUT,">$file" or die "Can't create $file: $!";
chmod(0755, $file);
print "Extracting $file (with variable substitutions)\n";

my $VERSION="unknown";
if (-r "../TagReader.pm"){ # get version
    open(F,"../TagReader.pm")||die;
    while(<F>){
        if (/\$VERSION *= *(.+)/){
            $VERSION=$1;
            $VERSION=~s/[^\.\d]//g;
        }
    }
    close F;
}

print OUT "$Config{'startperl'} -w
my \$VERSION = \"$VERSION\";
";


while(<DATA>){
        print OUT;
}

__END__ 
# vim: set sw=4 ts=4 si et:
# Copyright: GPL, Author: Guido Socher
#
no locale;
use strict;
use vars qw($opt_l $opt_a $opt_h $opt_A $opt_W $opt_i);
use Getopt::Std;
use HTML::TagReader;
#
sub help();
sub showonefile($);
sub linktype($);
sub dirname($);
sub flattenpath($);
#
my @possibilities= qw(index.html index.htm index.shtml default.htm index.php3 index.php index.wml);
my %killduplicate;
#
getopts("haAlWi:")||die "ERROR: No such option. -h for help.\n";
help() if ($opt_h);
help() unless ($ARGV[0]);
if ($opt_a){
    $opt_l=1;
}
if ($opt_i){
    @possibilities=split(/,/,$opt_i);
}
for my $f (@ARGV){
    if ( -r "$f" ){
        showonefile($f);
    }else{
        warn "ERROR: can not read $f\n";
    }
}
# 
# list tags in exactly one file 
#
sub showonefile($){
    my $infile=shift;
    my @tag;
    my $linktype; # one of: name href src background
    my ($waitclosetag,$origtag,$aline,$atag,$path,$tmp,$ltype,$line,$waitcount,$dir,$ckpath,$cpos);

    my $p=new HTML::TagReader "$infile";
    print "$infile\n" unless($opt_l);
    $waitclosetag=0;
    $waitcount=0;
    $atag="";
    while(@tag = $p->getbytoken(!$opt_W)){
        # read out the tags, note something like: 
        # <a name="xxxx" href="...."> .... </a>
        # is valid
        #
        $origtag=$tag[0];
        if($tag[1] eq "" || $tag[1] eq "!--"){ # not a tag, but text or a comment
            if ($waitclosetag && $opt_a){
                $atag.=$tag[0];
            }
            next;
        } 
        if ($waitclosetag){
            $tag[0]=~s/\s+/ /g; # kill newline and double space
            $atag.=$tag[0];
            $waitcount++;
            if ($tag[1] eq "/a"){
                $waitclosetag=0;
                print "$infile:$aline: $atag\n";
                $atag="";
                next;
            }
            if ($waitcount > 10){
                print STDERR "$infile:$aline: Warning \"<a ...\" starting not terminated\n" unless($opt_W);
                $waitclosetag=0;
                $waitcount=0;
                if ($opt_l){
                    print "$infile:$aline: $atag\n";
                }else{
                    print "$atag\n";
                }
                $atag="";
            }
        }
        # we search for " href", " src", " background": 
        unless($tag[0]=~/\ssrc|\shref|\sbackground/i){
            next;
        } 
        $line=$tag[2];
        $cpos=$tag[3];
        $tag[0]=~s/\s+/ /g; # kill newline and double space
        next unless ($tag[0]=~/ (href|src|background) ?=/i);
        #
        if ($tag[0]=~/ (href|src|background) ?= ?([^ >]+)/i){
            $linktype=$1;
            $path=$2;
            $path=~s/[\'\"]//g;
            $ltype=linktype($path);
            if (! $opt_A ){
                next if ($ltype ne "rel");
            }
            if ($tag[1] eq "a" && $opt_a ){
                if ($waitclosetag){
                    print STDERR "Warning: $infile:$line:$cpos: \"<a ...\" starting at line $aline not terminated\n" unless($opt_W);
                    print "$infile:$aline: $atag\n";
                }
                # wait for closing "</a>"
                $waitcount=0;
                $aline=$line;
                $waitclosetag=1;
                $atag=$tag[0];
            }else{
                # print now
                if ($opt_l){
                    print "$infile:$line:$cpos: $tag[0]\n";
                }else{
                    # chop off the anchors unless -l was given:
                    # index.html#chapter1 should be index.html otherwise
                    # we can not use it with tar:
                    $path=~s/\#.*$//;
                    # decode URL encoding:
                    $path=~tr/+/ /;
                    $path=~s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
                    # now try to expand links on directories with
                    # the appropriate index file:
                    $dir=dirname($infile);
                    $ckpath="$dir/$path";
                    if ( -f "$ckpath"){
                        $ckpath=flattenpath($ckpath);
                        print "$ckpath\n" unless($killduplicate{$ckpath});
                        $killduplicate{$ckpath}=1;
                        next;
                    }
                    my $found =0;
                    if ( -d "$ckpath"){
                        # remove optional '/'
                        $path=~s/\/$//;
                        if ($path eq "."){ # remove single '.'
                            $ckpath=$dir;
                            $path="";
                        }else{
                            $ckpath="$dir/$path";
                        }
                        for my $i (@possibilities){
                            if ( -f "$ckpath/$i"){
                                # found:
                                $ckpath="$ckpath/$i";
                                $ckpath=flattenpath($ckpath);
                                print "$ckpath\n" unless($killduplicate{$ckpath});
                                $killduplicate{$ckpath}=1;
                                $found=1;
                                last;
                            }
                        }
                    }
                    if(!$found){
                        if ($ltype eq 'rel'){
                            $ckpath="$dir/$path";
                            $ckpath=flattenpath($ckpath);
                            print "$ckpath\n" unless($killduplicate{"$ckpath"});
                            $killduplicate{"$ckpath"}=1;
                        }else{
                            $path=flattenpath($path);
                            print "$path\n" unless($killduplicate{$path});
                            $killduplicate{$path}=1;
                        }
                    }
                }
            }
        }else{
            print STDERR "$infile:$line:$cpos: Warning, broken link $tag[0]\n" unless($opt_W);
        }
    }
}
#----------------------------------
#  remove .. in a path, by compensating it with previous path components. 
# /zz/../xx becomes /xx
# /zz/../../xx becomes ../xx
# /yy/zz/vv/../../xx becomes /yy/xx
sub flattenpath($){
    my $p=shift;
    if ($p=~/^\w+:\/\//){
        # something like http://...
        return($p);
    }
    # change x/./y to x/y
    $p=~s!/\./!/!g;
    # change ./yy/a -> yy/a
    $p=~s!^\./!!g;
    return($p) unless($p=~m!/!);
    # the first component is empty if path starts with a slash
    my @pstack=split(/\//,$p);
    my $i = $#pstack-1;
    my $rmcnt=0;
    my @valcomp; # components we keep
    my $result="";
    while ($i>=0){
        if ($pstack[$i] eq '..'){
            $rmcnt++;
        }else{
            if ($rmcnt && $pstack[$i]){
                # compensate this component for ..
                $rmcnt--;
            }else{
                unshift(@valcomp,$i);
            }
        }
        $i--;
    }
    if (!$pstack[0]){
        # first empty component -> leading slash
        $result='/';
        shift @valcomp;
    }
    # non compensatable "..":
    while ($rmcnt){
        $result .= '../';
        $rmcnt--;
    }
    for $i (@valcomp){
        $result .= $pstack[$i].'/'; 
    }
    $result .= $pstack[$#pstack];
    return($result);
}
#----------------------------------
# get the directory name from file name
sub dirname($){
    my $f=shift;
    if ($f=~m=/=){
        $f=~s=/[^/]*$==;
        return("$f");
    }else{
        return(".");
    }
}
#----------------------------------
# find out if this is an abs link (proto://, file:, rel, /absfile)
sub linktype($){
    my $pathstr=shift; # no quotes must be arround the path
    if ($pathstr=~ m=^/=){
        return('/absfile');
    }elsif ($pathstr=~ m=^\.=){
        return('rel'); # may still contain a ref to named anchor
    }elsif ($pathstr=~ m=^\#=){
        return('anchor'); # relative anchor in the same file!
    }elsif ($pathstr=~ m=^file:=i){
        return('file:');
    }elsif ($pathstr=~ m=^\w+://=i){
        return('proto://');
    }elsif ($pathstr=~ m=^\w+:=i){
        return('proto:'); # mailto: or javascript:
    }else{
        return('rel');
    }
}
#----------------------------------
sub help(){
print "tr_llnk -- list links in html files

USAGE: tr_llnk [-h] [-AalW] [-i fname-list] html-files

Without option -A tr_llnk will list only relative links.
The idea of this program is that you run 
tar cvf pages.tar `tr_llnk index.html something.html`
in order to include files that are referenced by index.html and
something.html. 

With the option -A you can use tr_llnk to inspect the links in
html files.

OPTIONS: 
         -h this help

         -a list for <a href=...> tags everything until </a>
         This option implies (-l). You can not use this option
         in combination with tar to pack files together. This is 
         more to look at the links of a webpage.

         -A list all href=...,  src=..., and background=... tags
         whether they are relative or not.

         -i index list. tr_llnk expands links on directories when possible
         by adding the default index file. E.g \"href=../\" becomes
         \"href=../index.html\". This option specifies the search order
         for the index file. It is a comma seperated list of files.
         Default (without this option):
         index.html,index.htm,index.shtml,default.htm,index.php3,
         index.php,index.wml
         Note that this option is not relevant if -l or -a was specified.

         -l print with filename, line number and entire tag. Note that errors
         and warnings are always printed with filename and line numbers.
         If -l is given then the file names from the command line are
         not included unless they appear in one of the files as
         links. You can not use this in combination with tar to pack 
         files together.

         -W do not print warnings about html errors (not terminated
         tags etc ...).

EXAMPLE:
 pack a tar file:
 tar cvf pages.tar `tr_llnk index.html something.html`

 inspect a html file:
 tr_llnk -Al something.html

tr_llnk is part of the HTML::TagReader package.

version $VERSION
         \n";
exit(0);
}
__END__ 

=head1 NAME

tr_llnk -- list links in html files

=head1 SYNOPSIS

USAGE: tr_llnk [-h] [-AalW] [-i fname-list] html-files

=head1 DESCRIPTION

Without option -A tr_llnk will list only relative links.
The idea of this program is that you run 
tar cvf pages.tar `tr_llnk index.html something.html`
in order to include files that are referenced by index.html and
something.html. 

With the option -A you can use tr_llnk to inspect the links in
html files.

=head1 OPTIONS

B<-h> this help

B<-a> list for E<lt>a href=...E<gt> tags everything until E<lt>/aE<gt>
This option implies (-l). You can not use this option
in combination with tar to pack files together. This is 
more to look at the links of a web page.

B<-A> list all href=...,  src=..., and background=... tags
whether they are relative or not.

B<-i> index list. tr_llnk expands links on directories when possible
by adding the default index file. E.g "href=../" becomes
"href=../index.html". This option specifies the search order
for the index file. It is a comma seperated list of files.
Default (without this option):
index.html,index.htm,index.shtml,default.htm,index.php3,
index.php,index.wml
Note that this option is not relevant if -l or -a was specified.

B<-l> print with filename, line number and entire tag. Note that errors
and warnings are always printed with filename and line numbers.
If -l is given then the file names from the command line are
not included unless they appear in one of the files as
links. You can not use this in combination with tar to pack 
files together.

B<-W> do not print warnings about html errors (not terminated
tags etc ...).

=head1 EXAMPLE

pack a tar file:
tar cvf pages.tar `tr_llnk index.html something.html`

inspect a html file:
tr_llnk -Al something.html

=head1 AUTHOR

tr_llnk is part of the HTML::TagReader package and was written by
Guido Socher [guido(at)linuxfocus.org]

=cut