The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
# cmp - compare two files
# VERSION 1.0
#
# Notes:
#
# It's nice to see that, speedwise, this beats the pants off 
# GNU and IRIX system cmp in many tasks.  If compiled, this
# would probably be faster in every case.  Go Perl !
#
#
# Compat:
#
# GNU, Solaris, and IRIX cmp seem to give the EOF warning 
# whenever -l is given and file sizes are not equal.  That 
# doesn't match the BSD docs.  We follow the docs.
#
#
# Todo:
#
# support grouped single-letter opts
# revise how zero-length files are handled ?
#

# Final cleanup, run on every exit path (including exits caused by die).
#
# Flushes and closes standard output so that buffered write errors (for
# example a full disk or a closed pipe) are reported rather than lost, and
# normalises the exit status.
END {
    # die() terminates with status 255; this program reports every failure
    # as status 1.
    $? = 1 if $? == 255;

    # Check the result of close() explicitly.  The previous form,
    # "close STDOUT || die ...", parsed as "close(STDOUT || die ...)", so a
    # failed close could never be reported.
    unless (close STDOUT) {
        warn "$0: can't close stdout: $!\n";
        $? ||= 1;    # never report success when output was lost
    }
}

use strict;
use warnings;

my $chunk_size = 10_000;     # how many bytes in a gulp

my $volume = 1;              # 0 = silent (-s), 1 = default, 2 = list all (-l)

my $file1;

# Option parsing.  Grouped single-letter options are not yet supported.
# The loop stops at the first non-option argument, which becomes file1;
# everything after it is left in @ARGV.
while (@ARGV) {
    my $item = shift @ARGV;

    if ($item eq '--') { $file1 = shift @ARGV; last; }
    manual() if $item eq '-?';          # exits
    if ($item eq '-l') { $volume = 2; next; }
    if ($item eq '-s') { $volume = 0; next; }
    usage()  if $item =~ /^-./;         # exits

    $file1 = $item;
    last;
}

# What remains must be: file2 [skip1 [skip2]]
usage() unless defined $file1 and @ARGV >= 1 and @ARGV <= 3;  # exits

my $file2 = shift @ARGV;
my $skip1 = parse_offset(shift @ARGV);   # exits via usage() if malformed
my $skip2 = parse_offset(shift @ARGV);

# Shortcut: one and the same file, compared from the same offset, is
# trivially identical.  Both the device and the inode number must match,
# and an inode of 0 (reported on platforms where inodes are meaningless)
# is never trusted.
my ($dev1, $ino1) = stat($file1);
my ($dev2, $ino2) = stat($file2);

exit 0 if $skip1 == $skip2
       && defined $ino1 && defined $ino2
       && $ino1 && $ino1 == $ino2 && $dev1 == $dev2;

my $fh1 = open_input($file1);
my $fh2 = open_input($file2);

skip_bytes($fh1, $file1, $skip1);
skip_bytes($fh2, $file2, $skip2);

my $bytes_read     = 0;   # bytes consumed by earlier, fully compared chunks
my $lines_read     = 0;   # newlines seen in those chunks (from file1)
my $saw_difference = 0;   # set once -l has reported at least one difference

# Compare chunk by chunk.  read_chunk() only returns fewer than $chunk_size
# bytes at end of file, so a length mismatch always means that one file
# ended before the other.  Zero-length files need no special case: they
# simply yield an empty first chunk.
while (1) {
    my $buffer1 = read_chunk($fh1, $file1);
    my $buffer2 = read_chunk($fh2, $file2);

    my $length1 = length $buffer1;
    my $length2 = length $buffer2;
    my $common  = $length1 < $length2 ? $length1 : $length2;

    if ($buffer1 ne $buffer2) {
        for my $offset (0 .. $common - 1) {
            my $char1 = substr($buffer1, $offset, 1);
            my $char2 = substr($buffer2, $offset, 1);
            next if $char1 eq $char2;

            exit 1 if $volume == 0;             # -s: status only

            my $report_bytes = $bytes_read + $offset + 1;

            if ($volume == 1) {                 # default: first difference
                my $report_lines = $lines_read + 1
                                 + (substr($buffer1, 0, $offset) =~ tr/\n//);
                print "$file1 $file2 differ: char $report_bytes, line $report_lines\n";
                exit 1;
            }

            $saw_difference = 1;                # -l: list every difference
            printf "%6d %3o %3o\n", $report_bytes, ord($char1), ord($char2);
        }
    }

    if ($length1 != $length2) {
        # Per the BSD documentation, the EOF message is only given when
        # nothing else has been reported, and never under -s.
        my $shorter = $length1 < $length2 ? $file1 : $file2;
        warn "$0: EOF on $shorter\n" unless $saw_difference or !$volume;
        exit 1;
    }

    last if $length1 < $chunk_size;             # both files exhausted

    $lines_read += ($buffer1 =~ tr/\n//);
    $bytes_read += $length1;
}

close $fh1;
close $fh2;

exit $saw_difference;

# parse_offset TEXT
#
# Converts a skip argument to a number of bytes.  Accepts decimal, octal
# (leading "0") and hexadecimal (leading "0x", either case).  An undefined
# argument means "no skip" and yields 0.  Anything else is a usage error,
# including octal strings with the digits 8 or 9.
sub parse_offset {
    my ($text) = @_;

    return 0 unless defined $text;
    usage() unless $text =~ /\A(?:0x[0-9a-f]+|0[0-7]*|[1-9][0-9]*)\z/i;  # exits

    return $text =~ /\A0/ ? oct(lc $text) : $text + 0;
}

# open_input NAME
#
# Opens NAME for reading in binary mode and returns the handle; dies with
# the system error if the file cannot be opened.  The name "-" selects
# standard input, which the earlier two-argument open provided implicitly.
sub open_input {
    my ($name) = @_;
    my $fh;

    if ($name eq '-') {
        $fh = \*STDIN;
    }
    else {
        open $fh, '<', $name or die "$0: cannot open $name: $!\n";
    }
    binmode $fh;    # compare raw bytes, with no newline or encoding layers

    return $fh;
}

# skip_bytes HANDLE, NAME, COUNT
#
# Positions HANDLE COUNT bytes into the input.  Seeks when possible;
# otherwise (pipes, terminals) reads and discards the bytes.  Stopping at
# end of file is not an error: the comparison then sees an empty input.
sub skip_bytes {
    my ($fh, $name, $count) = @_;

    return unless $count;
    return if defined sysseek($fh, $count, 0);

    my $discard;
    while ($count > 0) {
        my $want = $count < $chunk_size ? $count : $chunk_size;
        my $got  = sysread($fh, $discard, $want);
        die "$0: read error on $name: $!\n" unless defined $got;
        last unless $got;
        $count -= $got;
    }
    return;
}

# read_chunk HANDLE, NAME
#
# Returns the next chunk of input: exactly $chunk_size bytes, or fewer
# (possibly none) only when end of file has been reached.  Short reads,
# which are normal on pipes, are retried until the chunk is full.  Dies on
# a read error so that an unreadable input is never mistaken for an equal
# one.
sub read_chunk {
    my ($fh, $name) = @_;
    my $buffer = '';

    while (length($buffer) < $chunk_size) {
        my $got = sysread($fh, $buffer,
                          $chunk_size - length($buffer), length($buffer));
        die "$0: read error on $name: $!\n" unless defined $got;
        last unless $got;
    }
    return $buffer;
}

# usage
#
# Reports the command-line synopsis on standard error and terminates the
# program by way of die.  Takes no arguments and never returns.
sub usage {
    my $synopsis = "usage: $0 [-l] [-s] file1 file2 [skip1 [skip2]]\n";
    die $synopsis;
}

# manual
#
# Displays the manual page: the POD that follows __END__ in this file is
# read from the DATA handle and piped through the external "pod2text"
# converter.  Takes no arguments and never returns; it exits with status 0
# on success and dies if the converter cannot be started or fails.
sub manual {

    # I thought using inline docs like this was a natural extension
    # of perlishness, like better regular expressions, but tchrist
    # disapproves.

    # why is pod2text the only reasonable way to feed pod in
    # on STDIN ?  Would rather use "|perldoc -", as it already
    # knows about pager, etc.

    open my $man, '|-', 'pod2text'
        or die "$0: Cannot open pod2text converter\n";
    print {$man} <DATA>;

    # For a pipe, close() waits for the child and returns false if it could
    # not be executed or exited unsuccessfully; $! is zero in the latter
    # case and the wait status is left in $?.
    close $man
        or die $! ? "$0: error closing pod2text converter: $!\n"
                  : "$0: pod2text converter exited with status " . ($? >> 8) . "\n";
    exit;
}

__END__

=head1 NAME

cmp - compare two files

=head1 SYNOPSIS

cmp B<[-l]> B<[-s]> I<file1> I<file2> I<[skip1 [skip2]]>

=head1 DESCRIPTION

I<cmp> compares two files, byte-by-byte.  The result of the comparison
is always given by the exit status, and may be summarized on the
standard output according to the options given on the command line.

If the two compared files are identical, I<cmp> will exit with
a zero exit status.  If the compared files are not identical, I<cmp>
will exit with a status of 1.

If no options are given, I<cmp> will report, on the standard output,
the byte number and line number where the first difference is encountered.

If one file is an initial subsequence of the other, a message will
also be written to the standard error indicating that EOF was reached
in the shorter of the two files.

I<skip1> and I<skip2> are optional byte offsets into I<file1> and
I<file2>, respectively, that determine where the file comparison
will begin.  Offsets may be given in decimal, octal, or hexadecimal 
form.  Indicate octal notation with a leading '0', and hexadecimal 
notation with a leading '0x'.

=head2 OPTIONS

=over 

=item -s

silent execution; indicate results only by exit status, suppressing
all output and warnings.

=item -l

list differences; return the byte number and the differing byte values 
for each difference between the two files.  The byte number is given
in decimal, and the byte values are given in octal.

=back

=head1 ENVIRONMENT

No environment variables affect the execution of I<cmp>.

=head1 BUGS

No known bugs.

=head1 AUTHOR   

D Roland Walker I<E<lt>walker@pobox.comE<gt>>.

=head1 COPYRIGHT and LICENSE

This program is copyright (c) D Roland Walker 1999.

This program is free and open software. You may use, modify, distribute,
and sell this program (and any modified variants) in any way you wish,
provided you do not restrict others from doing the same.