The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package MARC::File::MicroLIF;

=head1 NAME

MARC::File::MicroLIF - MicroLIF-specific file handling

=cut

use strict;
use integer;
use vars qw( $ERROR );

use MARC::File;
use vars qw( @ISA ); @ISA = qw( MARC::File );

use MARC::Record qw( LEADER_LEN );

=head1 SYNOPSIS

    use MARC::File::MicroLIF;

    my $file = MARC::File::MicroLIF->in( $filename );

    while ( my $marc = $file->next() ) {
        # Do something
    }
    $file->close();
    undef $file;

=head1 EXPORT

None.

=cut


=for internal

The buffer must be large enough to handle any valid record because
we don't check for cases like a CR/LF pair or an end-of-record/CR/LF
trio being only partially in the buffer.

The max valid record is the max MARC record size (99999) plus one
or two characters per tag (CR, LF, or CR/LF).  It's hard to say
what the max number of tags is, so here we use 6000.  (6000 tags
can be squeezed into a MARC record only if every tag has only one
subfield containing a maximum of one character, or if data from
multiple tags overlaps in the MARC record body.  We're pretty safe.)

=cut

use constant BUFFER_MIN => (99999 + 6000 * 2);

=head1 METHODS

=head2 in()

Opens a MicroLIF file for reading.

=cut

sub in {
    my $class = shift;
    my $self = $class->SUPER::in( @_ );

    if ( $self ) {
        bless $self, $class;

        $self->{exhaustedfh} = 0;
        $self->{inputbuf} = '';
        $self->{header} = undef;

        # get the MicroLIF header, but handle the case in
        # which it's missing.
        my $header = $self->_get_chunk( 1 );
        if ( defined $header ) {
            if ( $header =~ /^LDR/ ) {
                # header missing, put this back
                $self->_unget_chunk( $header . "\n" );

                # XXX should we warn of a missing header?
            }
            else {
                $self->{header} = $header;
            }
        }
        else {
            # can't read from the file
            undef $self;
        }
    }

    return $self;
} # new


# fill the buffer if we need to
sub _fill_buffer {
    my $self = shift;
    my $ok = 1;

    if ( !$self->{exhaustedfh} && length( $self->{inputbuf} ) < BUFFER_MIN ) {
        # append the next chunk of bytes to the buffer
        my $read = read $self->{fh}, $self->{inputbuf}, BUFFER_MIN, length($self->{inputbuf});
        if ( !defined $read ) {
            # error!
            $ok = undef;
            $MARC::File::ERROR = "error reading from file " . $self->{filename};
        }
        elsif ( $read < 1 ) {
            $self->{exhaustedfh} = 1;
        }
    }

    return $ok;
}


=for internal

Gets the next chunk of data.  If C<$want_line> is true then you get
the next chunk ending with any combination of \r and \n of any length.
If it is false or not passed then you get the next chunk ending with
\x60 followed by any combination of \r and \n of any length.

All trailing \r and \n are stripped.

=cut

sub _get_chunk {
    my $self = shift;
    my $want_line = shift || 0;

    my $chunk = undef;

    if ( $self->_fill_buffer() && length($self->{inputbuf}) > 0 ) {

        # the buffer always has at least one full line in it, so we're
        # guaranteed that if there are no line endings then we're
        # on the last line.

        if ( $want_line ) {
            if ( $self->{inputbuf} =~ /^([^\x0d\x0a]*)([\x0d\x0a]+)/ ) {
                $chunk = $1;
                $self->{inputbuf} = substr( $self->{inputbuf}, length($1)+length($2) );
            }
        }
        else {
            # couldn't figure out how to make this work as a regex
            my $pos = -1;
            while ( !$chunk ) {
                $pos = index( $self->{inputbuf}, '`', $pos+1 );
                last if $pos < 0;
                if ( substr($self->{inputbuf}, $pos+1, 1) eq "\x0d" or substr($self->{inputbuf}, $pos+1, 1) eq "\x0a" ) {
                    $chunk = substr( $self->{inputbuf}, 0, $pos+1 ); # include the '`' but not the newlines
                    while ( substr($self->{inputbuf}, $pos+1, 1) eq "\x0d" or substr($self->{inputbuf}, $pos+1, 1) eq "\x0a" ) {
                        ++$pos;
                    }
                    # $pos now pointing at last newline char
                    $self->{inputbuf} = substr( $self->{inputbuf}, $pos+1 );
                }
            }
        }

        if ( !$chunk ) {
            $chunk = $self->{inputbuf};
            $self->{inputbuf} = '';
            $self->{exhaustedfh} = 1;
        }
    }

    return $chunk;
}


# $chunk is put at the beginning of the buffer exactly as
# passed in.  No line endings are added.
sub _unget_chunk {
    my $self = shift;
    my $chunk = shift;
    $self->{inputbuf} = $chunk . $self->{inputbuf};
    return;
}


sub _next {
    my $self = shift;

    my $lifrec = $self->_get_chunk();

    # for ease, make the newlines match this platform
    $lifrec =~ s/[\x0a\x0d]+/\n/g if defined $lifrec;

    return $lifrec;
}


=head2 header()

If the MicroLIF file has a file header then the header is returned.
If the file has no header or the file has not yet been opened then
C<undef> is returned.

=cut

sub header {
    my $self = shift;
    return $self->{header};
}

=head2 decode()

Decodes a MicroLIF record and returns a USMARC record.

Can be called in one of three different ways:

    $object->decode( $lif )
    MARC::File::MicroLIF->decode( $lif )
    MARC::File::MicroLIF::decode( $lif )

=cut

sub decode {
    my $self = shift;
    my $location = '';
    my $text = '';

    ## decode can be called in a variety of ways
    ## this bit of code covers all three

    if ( ref($self) =~ /^MARC::File/ ) {
        $location = 'in record '.$self->{recnum};
        $text = shift;
    } else {
        $location = 'in record 1';
        $text = $self=~/MARC::File/ ? shift : $self;
    }

    my $marc = MARC::Record->new();

    # for ease, make the newlines match this platform
    $text =~ s/[\x0a\x0d]+/\n/g if defined $text;

    my @lines = split( /\n/, $text );
    for my $line ( @lines ) {

        ($line =~ s/^([0-9A-Za-z]{3})//) or
            $marc->_warn( "Invalid tag number: ".substr( $line, 0, 3 )." $location" );
        my $tagno = $1;

        ($line =~ s/\^`?$//)
            or $marc->_warn( "Tag $tagno $location is missing a trailing caret." );

        if ( $tagno eq "LDR" ) {
            $marc->leader( substr( $line, 0, LEADER_LEN ) );
        } elsif ( $tagno =~ /^\d+$/ and $tagno < 10 ) {
            $marc->add_fields( $tagno, $line );
        } else {
            $line =~ s/^(.)(.)//;
            my ($ind1,$ind2) = ($1,$2);
            my @subfields;
            my @subfield_data_pairs = split( /_(?=[a-z0-9])/, $line );
            if ( scalar @subfield_data_pairs < 2 ) {
                $marc->_warn( "Tag $tagno $location has no subfields--discarded." );
            }
            else {
                shift @subfield_data_pairs; # Leading _ makes an empty pair
                for my $pair ( @subfield_data_pairs ) {
                    my ($subfield,$data) = (substr( $pair, 0, 1 ), substr( $pair, 1 ));
                    push( @subfields, $subfield, $data );
                }
                $marc->add_fields( $tagno, $ind1, $ind2, @subfields );
            }
        }
    } # for

    return $marc;
}

1;

__END__

=head1 TODO

=over 4

=back

=head1 RELATED MODULES

L<MARC::File>

=head1 LICENSE

This code may be distributed under the same terms as Perl itself.

Please note that these modules are not products of or supported by the
employers of the various contributors to the code.

=head1 AUTHOR

Andy Lester, C<< <andy@petdance.com> >>

=cut