The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w
# vi:et:sw=4 ts=4

=head1 NAME

marcdump - MARC record dump utility

=head1 SYNOPSIS

B<marcdump> [options] file(s)

=cut

use strict;
use integer;

use Encode;
use MARC::File;
use MARC::File::USMARC;
use MARC::Record;
use Getopt::Long;

## flag STDOUT for UTF8

my $opt_print = 1;
my $opt_hex = 0;
my $opt_quiet = 0;
my $opt_stats = 1;
my @opt_field = ();
my $opt_help = 0;
my $opt_lif = 0;

my $rc =
    GetOptions(
        "version"   => sub { print "$0, using MARC::Record v$MARC::Record::VERSION\n"; exit 1; },
        "print!"    => \$opt_print,
        "hex!"      => \$opt_hex,
        "lif!"      => \$opt_lif,
        "quiet!"    => \$opt_quiet,
        "stats!"    => \$opt_stats,
        "field=s"   => \@opt_field,
        "debug!"    => \$MARC::Record::DEBUG,
        "help"      => \$opt_help,
    );

my @files = @ARGV;
if ( $opt_help || !@files || !$rc ) {
    print <DATA>;
    exit 1;
}

my $wants_leader = grep { /LDR/ } @opt_field;

my $class = $opt_lif ? "MARC::File::MicroLIF" : "MARC::File::USMARC";
eval "require $class"; # Must be quoted to get path searching

my %counts;
my %errors;
for my $filename ( @files ) {
    $counts{$filename} = 0;
    $errors{$filename} = 0;

    warn "$filename\n" unless $opt_quiet;

    my $file = $class->in( $filename ) or die $MARC::File::ERROR;

    while ( my $marc = $file->next() ) {
        ++$counts{$filename};
        warn "$counts{$filename} records\n" if ( !$opt_quiet && ($counts{$filename} % 1000 == 0) );

        if ( @opt_field ) {
            $marc = $marc->clone( @opt_field );
            $marc->leader('') unless $wants_leader;
        }

        if ( $opt_print ) {
            if ( $opt_hex ) {
                print_hex( $marc );
            }
            else {
                # stifle warnings here in case there's utf8 data being printed
                no warnings;
                print $marc->as_formatted, "\n\n";
            }
        }

        if ( $marc->warnings() ) {
            ++$errors{$filename};
            print join( "\n", $marc->warnings(), "" );
        }
    } # while
    $file->close();
} # for

if ( $opt_stats ) {
    print " Recs  Errs Filename\n";
    print "----- ----- --------\n";
    for my $key ( sort keys %counts ) {
        printf( "%5d %5d %s\n", $counts{$key}, $errors{$key}, $key );
    } # for
} # if stats


sub print_hex {
    my $marc = shift;
    my $raw = $marc->as_usmarc();

    print "\n";

    my $offset = 0;

    # dump the leader
    my $leader = bytes::substr( $raw, 0, MARC::Record::LEADER_LEN );
    my $part1 = bytes::substr( $leader, 0, 12 );
    my $part2 = bytes::substr( $leader, 12 );
    _hex_line_output( $offset,    _to_hex($part1), _to_ascii($part1), 48 );
    _hex_line_output( $offset+12, _to_hex($part2), _to_ascii($part2), 48 );
    $offset += MARC::Record::LEADER_LEN;

    # dump the directory.  If we can't find end-of-field character that
    # follows the directory, everything following the leader (which we
    # have already dumped) will be dumped as part of the data section.
    my $dir_end = bytes::index( $raw, MARC::File::USMARC::END_OF_FIELD, MARC::Record::LEADER_LEN );
    if ( $dir_end >= 0 ) {
        for ( my $n = $offset; $n < $dir_end; $n += MARC::File::USMARC::DIRECTORY_ENTRY_LEN ) {

            my $dir_entry = bytes::substr( $raw, $n, MARC::File::USMARC::DIRECTORY_ENTRY_LEN );

            my $hex = _to_hex( $dir_entry );

            my $ascii =
                bytes::substr( $dir_entry, 0, 3 ) . ' ' .
                bytes::substr( $dir_entry, 3, 4 ) . ' ' .
                bytes::substr( $dir_entry, 7, 5 )
            ;

            _hex_line_output( $offset, $hex, $ascii, 48 );
            $offset += MARC::File::USMARC::DIRECTORY_ENTRY_LEN;
        }

        # dump the end-of-field character that follows the directory
        _hex_line_output( $offset, _to_hex(MARC::File::USMARC::END_OF_FIELD), '.', 48 );
        ++$offset;
    }

    # dump the data
    my $data_offset = 0;
    while ( $offset < bytes::length($raw) ) {
        my $chunk = bytes::substr( $raw, $offset, 16 );
        _hex_line_output( $data_offset, _to_hex($chunk), _to_ascii($chunk), 48 );
        $offset += 16;
        $data_offset += 16;
    }
}

sub _to_ascii {
    my $raw = shift;
    if ( defined $raw ) {
        $raw =~ s/[\x00-\x1f\x7f-\xff]/./g;
    }
    return $raw;
}

sub _to_hex {
    my $raw = shift;
    my $result = '';

    if ( defined $raw ) {
        for ( my $n = 0; $n < bytes::length($raw); $n++ ) {
            $result .= sprintf( '%02x ', ord(bytes::substr($raw, $n, 1)) );
        }
    }
    $result =~ s/ $//;

    return $result;
}

sub _hex_line_output {
    my $offset = shift;
    my $hex = shift;
    my $ascii = shift;
    my $width = shift;

    printf( "%05d: %-$width.${width}s %s\n", $offset, $hex, $ascii );
}


__END__
Usage: marcdump [options] file(s)

Options:
    --[no]print     Print a MicroLIF-style dump of each record
    --[no]hex       If --print active, make the output hexadecimal
    --lif           Input files are MicroLIF, not USMARC
    --field=spec    Specify a field spec to include.  There may be many.
                    Examples:
                        --field=245 --field=1XX
    --[no]quiet     Print status messages
    --[no]stats     Print a statistical summary by file at the end
    --version       Print version information
    --help          Print this summary