The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl

=pod
=begin reST
=begin Id
$Id: htmldiff.prl 767 2006-01-28 03:29:15Z marknodine $
Copyright (C) 2002-2005 Freescale Semiconductor, Inc.
Distributed under terms of the GNU General Public License (GPL).
=end Id

=begin Description
Description: This tool takes two HTML files and creates an HTML file
that has markings of text insertions and deletions.

=end Description
=begin Usage
Usage: htmldiff [options] file1 file2

Options:
   -b string      String to insert for backward link with -l.
                  Default: "${opt_b}"
   -d string      Deletion marking string, where "$_" specifies the
                  deleted words.
                  Default: "${opt_d}"
   -f string      String to insert for forward link with -l.
                  Default: "${opt_f}"
   -h             Prints help
   -i string      Insertion marking string, where "$_" specifies the
                  inserted words.
                  Default: "${opt_i}"
   -k             Create key describing difference format
   -l             Create links between difference regions
   -p string      Program to run for diff (program must support -D option).
                  Default: ${opt_p}
   -V             Prints version information
=end Usage
=end reST
=cut

BEGIN {
    $opt_b = '<font color=orange>&lt;</font>';
    $opt_f = '<font color=orange>&gt;</font>';
    $opt_i = '<U><FONT COLOR=#800080>$_</FONT></U>';
    $opt_d = '<S><FONT COLOR=#800000>$_</FONT></S>';
    $opt_p = 'diff';
}

# Get options and process them
use Getopt::Std;
Usage() unless getopts("b:d:f:hi:klp:V");

Usage('Id') if $opt_V;
Usage('Description') if $opt_h;

die "Wrong number of arguments" unless @ARGV == 2;

$opt_i =~ /(.*)\$_(.*)/;
($INSERT_PFX,$INSERT_SFX) = ($1, $2);
$opt_d =~ /(.*)\$_(.*)/;
($DELETE_PFX,$DELETE_SFX) = ($1, $2);

#sub INSERTION { "$INSERT_PFX$_[0]$INSERT_SFX" }
#sub DELETION { OptLString() . "$DELETE_PFX$_[0]$DELETE_SFX" }
($ctx_both, $ctx_1, $ctx_2) = (0 .. 2);

$args = join(' ',@ARGV);
$sym = "A__";
open (DIFF, "$opt_p -D $sym $args|") || die "Cannot run diff";
$context = $ctx_both;
$DIFF = 0;
if ($opt_k) {
    print "Key: ${DELETE_PFX}Deleted text.$DELETE_SFX\n";
    print "${INSERT_PFX}Inserted text.$INSERT_SFX\n";
    print "Use <A HREF='#'>$opt_b</A> and <A HREF='#__diff${\scalar($DIFF+1)}'>$opt_f</A> to jump to previous/next difference region.\n"
	if $opt_l;
    print "<hr />\n";
}
print  "<A HREF='#__diff${\scalar($DIFF+1)}' NAME='__diff$DIFF'>$opt_f</A> "
    if $opt_l;

while (<DIFF>) {
    if (/^#ifndef $sym\b/o) {
	$context = $ctx_1; next;
    }
    if (/^#ifdef $sym\b/o) {
	$context = $ctx_2; next;
    }
    if (/^#else /o) {
	$context = 3 - $context; next;
    }
    if (/^#endif /o) {
	$context = $ctx_both;
	# Process the diff
	# Get the word lists for each section
	$diff1 = join('', @diff1);
	$diff2 = join('', @diff2);
	if ($diff1 eq '') {
	    my $s = MarkNonHTML($diff2, $INSERT_PFX, $INSERT_SFX);
	    $s =~ s/($INSERT_PFX)/OptLString() . $1/eo;
	    print $s;
	}
	elsif ($diff2 eq '') {
	    my $s = MarkNonHTML($diff1, $DELETE_PFX, $DELETE_SFX);
	    $s =~ s/($DELETE_PFX)/OptLString() . $1/eo;
	    print $s;
	}
	else {
	    @words1 = GetWordList($diff1);
	    @words2 = GetWordList($diff2);
	    # Use diff to find out which words were added/deleted/changed
	    open (O1, "> /tmp/1.$$") || die "Cannot write to /tmp/1.$$";
	    print O1 join("\n",@words1),"\n";
	    close O1;
	    open (O2, "> /tmp/2.$$") || die "Cannot write to /tmp/2.$$";
	    print O2 join("\n",@words2),"\n";
	    close O2;
	    open (DIFF2, "diff /tmp/1.$$ /tmp/2.$$|");
	    # Record for each difference section the type of difference
	    # (a, c, or d), the beginning and ending words in diff2, and
	    # any non-HTML words deleted from diff1.
	    my(@deletes, @diffs);
	    while (<DIFF2>) {
		if (/^[0-9]+(?:,[0-9]+)?([acd][0-9]+(?:,[0-9]+)?)/) {
		    # A new difference section
		    push(@diffs, $1);
		    push(@deletes, []);
		}
		elsif (/^< ([^<].*)/) {
		    push(@{$deletes[$#deletes]}, $1);
		}
	    }
	    close DIFF2;
	    unlink "/tmp/1.$$";
	    unlink "/tmp/2.$$";
	    # Create the merged version.  We copy words from diff2 to
	    # $merged until we hit a difference section; for each
	    # difference section, we put the struck out section in and
	    # then mark as added any non-HTML words from diff2 for c
	    # and a type difference sections.
	    my($merged);
	    my($i);
	    my($word) = 1;
	    for ($i = 0; $i < @diffs; $i++) {
		my($diff) = $diffs[$i];
		my($deletes) = $deletes[$i];
		my($type,$start,$end) =
		    $diff =~ /([acd])([0-9]+)(?:,([0-9]+))?/;
		$start++ if $type eq 'd';
		$end = $start unless $end;
		# Copy over stuff
		while ($word < $start) {
		    if ($diff2 =~ /^(\s+)/) {
		    }
		    elsif ($diff2 =~ /^(<.*?>|[^<\s]+)/) {
			$word++;
		    }
		    else { next; }
		    $merged .= $1;
		    $diff2 = "$'";
		}
		# Mark any deletions
		my $new;
		$new = ' ' .
		    MarkNonHTML((join(' ',@$deletes)), $DELETE_PFX,
				$DELETE_SFX) . ' '
		    if @$deletes;
		if ($type =~ /[ac]/) {
		    # Mark any additions.  Copy the whole string to $add.
		    my($add);
		    while ($word <= $end) {
			if ($diff2 =~ /^(\s+)/) {
			}
			elsif ($diff2 =~ /^(<.*?>|[^<\s]+)/) {
			    $word++;
			}
			else { next; }
			$add .= $1;
			$diff2 = "$'";
		    }
		    # Now mark the non-HTML words.
		    $new .= MarkNonHTML($add, $INSERT_PFX, $INSERT_SFX);
		}
		$merged .= (($new =~ /$DELETE_PFX|$INSERT_PFX/o) ?
			    OptLString() : "") . $new;
	    }
	    # Copy the rest of the diff2 text
	    $merged .= $diff2;
	    print $merged;
	}
	undef @diff1;
	undef @diff2;
	next;
    }
    if ($context == $ctx_both) {
	print;
    }
    elsif ($context == $ctx_1) {
	push(@diff1, $_);
    }
    else {
	push(@diff2, $_);
    }
}
print "<A HREF='#__diff${\scalar($DIFF++)}' NAME='__diff$DIFF'>$opt_b</A>\n"
    if $opt_l;
print "($DIFF differences)\n" if $opt_k;

# Returns a list of words.  A word is either a set of non-space
# characters separated by spaces or a string enclosed between '<' and '>'.
# Inputs: string
# Outputs: array of words
sub GetWordList {
    $_ = $_[0];
    my(@out);
    my($last_pos) = -1;
    while ($last_pos != pos) {
	$last_pos = pos;
	next if (/\G(\s+)/gc);
	if (/\G(<.*?>)/gc || /\G([^<\s]+)/gc) {
	    push (@out, $1);
	}
    }
    return @out;
}

# Marks all non-HTML words by putting the prefix and suffix around
# them.
# Inputs: string, prefix, suffix
# Returns: string
sub MarkNonHTML {
    my($s, $pfx, $sfx) = @_;
    $s =~ s/(>|\A)(?!\s+(?:<|\Z))([^<]+)/$1$pfx$2$sfx/g;
    return $s;
}

# Returns a string that links from one difference section to the next
# Inputs: None
# Returns: string
# Uses globals: $opt_l, $DIFF
sub OptLString {
    return $opt_l ? 
	"<A HREF='#__diff${\scalar($DIFF++)}' NAME='__diff$DIFF'>$opt_b</A> <A HREF='#__diff${\scalar($DIFF+1)}'>$opt_f</A> " :
	"";
}

# This subroutine extracts and prints usage information
sub Usage {
    my ($what) = @_;
    $what = "Usage" if ! $what;
    my $mark = $what eq 'Description' ? "($what|Usage)" : $what;
    if (open(ME,$0) == 1) {
	while (<ME>) {
	    if ((/^=begin $mark/ .. /^=end $mark/) &&
		! /^=(begin|end) $mark/) {
		s/(\$\{[^\}]+\})/eval($1)/ge;
		print;
	    }
	}
	close(ME);
    }
    else {
	print STDERR "Usage not available.\n";
    }
    exit (1);
}