# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Array::LineReader;

use 5.005;
use strict;
use warnings;
use Carp;
use IO::File;
use Tie::Array;
require DynaLoader;

use vars qw( @ISA $VERSION );
@ISA = qw(Tie::Array);

=head1 NAME

Array::LineReader - Access lines of a file via an array

=head1 SYNOPSIS

  use Array::LineReader;
  my @lines;
  
  # Get the content of every line as an element of @lines:
  tie @lines, 'Array::LineReader', 'filename';
  print scalar(@lines);		# number of lines in the file
  print $lines[0];		# content of the first line
  print $lines[-1];		# content of the last line
  ...
  
  # Get the offset and content of every line as array reference via the elements of @lines:
  tie @lines, 'Array::LineReader', 'filename', result=>[];
  print scalar(@lines);		# number of lines in the file
  print $lines[5]->[0],":",$lines[5]->[1];	# offset and content of the 5th line
  print $lines[-1]->[0],":",$lines[-1]->[1];	# offset and content of the last line
  ...
  
  # Get the offset and content of every line as hash reference via the elements of @lines:
  tie @lines, 'Array::LineReader', 'filename', result=>{};
  print scalar(@lines);		# number of lines in the file
  print $lines[4]->{OFFSET},":",$lines[4]->{CONTENT};	# offset and content of the 4th line
  print $lines[-1]->{OFFSET},":",$lines[-1]->{CONTENT};	# offset and content of the last line
  ...

=cut

BEGIN {
	# Derive the module version from the RCS/CVS revision keyword:
	# revision "major.minor" becomes "major.0minor" (e.g. 1.1 -> 1.01).
	my ($major, $minor) = q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/;
	$VERSION = sprintf("%d.%02d", $major, $minor);
}

=head1 VERSION and VOLATILITY

	$Revision: 1.1 $
	$Date: 2004/06/10 18:17:23 $

=head1 DESCRIPTION

Array::LineReader gives you the possibility to access lines of some file by the
elements of an array.
This module inherits methods from C<Tie::Array> (see L<Tie::Array>).
You save a lot of memory, because the file's content is read only on demand, i.e. in 
the case you access an element of the array. The offsets and lengths of all the lines 
are held in memory as long as you tie your array.

The underlying file is opened for reading in binary mode.
(Yes, there are some OSs, that make a difference in 
interpreting the C<EOL>-sequence, i.e. C<End-Of-Line> and the C<EOF>-character, i.e.
C<End-Of-File> what is the character C<"\x1A">).
The bytes read are neither translated nor suppressed.

Lines are built up to and including the C<EOL>-sequence.
The C<EOL>-sequence is assumed to be C<"\x0D\x0A"> or C<"\x0A\x0D"> or C<"\x0D"> or
C<"\x0A">.

The file is not closed until you C<untie> the array.

It's up to you to define the kind of access:

=head2 Access content by element

	tie @lines, 'Array::LineReader', 'filename';

You get the content of every line of the file by the elements of the array C<@lines>:

	print "@lines";

=head2 Access offset and content by array references

	tie @lines, 'Array::LineReader', 'filename', result=>[];

You get offset and content of every line of the file via the elements of the array C<@lines>:

	foreach (@lines){
		print $_->[0],":";	# offset
		print $_->[1];		# content
	}

=head2 Access offset and content by hash references

	tie @lines, 'Array::LineReader', 'filename', result=>{};

You get offset and content of every line of the file via the elements of the array C<@lines>:

	foreach (@lines){
		print $_->{OFFSET},":";	# offset
		print $_->{CONTENT};	# content
	}

=head1 METHODS

=head2 TIEARRAY

=over 4

=item TIEARRAY Create this class

Overwrites the method C<TIEARRAY> of C<Tie::Array> (see L<Tie::Array>). You never should have to call it.
It is used to create this class.

This method croaks if the C<filename> is missing.
It croaks too if the additional parameters do not have the form of a hash,
i.e. have an odd number.

The file is opened in binary mode for reading only. If the file does not exist or cannot
be opened you will get an empty array without a warning.

The offset of every line and its length are held in arrays corresponding to the lines of the file.
The content of a given line is read only if you access the corresponding element of the tied array.

=back

=cut

# Constructor called by tie(): creates the object behind the tied array.
#
# Parameters: the class name, the name of the file to read, and an optional
#             list of key/value options (currently only "result" is used).
# Returns:    the blessed object.
# Croaks:     if the file name is missing or the options are not key/value pairs.
#
# A file that does not exist or can not be opened yields an empty array
# without any warning (EOF is set right away).
sub TIEARRAY{
	my $class = shift;
	croak "Usage: tie \@lines, 'Array::LineReader', 'filename' [, result=>type_of_result]\n" unless @_;
	my $filename = shift;
	# Refuse an odd number of options before they are turned into a hash.
	croak "Odd number of parameters to be used in a hash!\n" unless scalar(@_) % 2 == 0;
	my $self = {};
	# User supplied options override the defaults instead of replacing them.
	$self->{PARMS} = {result=>'', @_};
	$self->{FH} = undef;
	if (-f $filename){
		my $fh = IO::File->new;
		# Passing an explicit read mode keeps characters like '>' or '|' in
		# the file name from being interpreted as an open mode.
		if ($fh->open($filename, '<')){
			binmode($fh);
			$self->{FH} = $fh;
		}
	}
	# The first line starts at offset 0; its length is not known yet.
	$self->{OFFSETS} = [0];
	$self->{LENGTHS} = [0];
	$self->{EOF} = 1 unless $self->{FH};
	bless $self, $class;
}

=head2 FETCHSIZE

=over 4

=item FETCHSIZE Define the size of the tied array.

Overwrites the method C<FETCHSIZE> of C<Tie::Array> (see L<Tie::Array>). You never should have to call it.
This method is called any time the size of the underlying array has to be defined.

The size of the tied array is defined to be the number of B<lines in the file>.
Asking for the size reads the file up to its end, unless that has already happened.

Lets have an example:

	tie @lines, 'Array::LineReader', 'filename';
	$line5 = $lines[4];	# access the 5th line.
	print scalar(@lines);	# prints: number of lines in the file
	$lastline = $lines[-1];	# access the last line of the file
	print scalar(@lines);	# prints: number of lines in the file

=back

=cut

# Returns the number of lines of the file.
#
# Lines not yet indexed are read until EOF is reached. OFFSETS always holds
# one entry more than the number of lines read (the start of the next line),
# so its last index is the line count. Deriving the result from OFFSETS
# instead of a separate counter keeps an empty file at size 0.
sub FETCHSIZE{
	my $self = shift;
	while (!$self->{EOF}){
		# Nothing more to read: stop even if EOF was not flagged.
		last unless defined $self->_readIt($#{$self->{OFFSETS}});
	}
	return $#{$self->{OFFSETS}};
}

=head2 FETCH

=over 4

=item FETCH access a specified element of the tied array

Overwrites the method C<FETCH> of C<Tie::Array> (see L<Tie::Array>). You never should have to call it.
This method is called any time you want to access a given element of the tied array.

If you access an already known element, the offset of the line to read is sought and the
line is read with the already known length.
If you access a not yet known element, the file is read up to the corresponding line.

Lets have an example:

	tie @lines, 'Array::LineReader', 'filename';
	foreach (@lines){
		print $_;	# access one line after the other
	}
	...
	print $lines[5];	# seeks the offset of the 6th line and reads it

You should use the tie command with additional parameter defining the type of the result,
if you want to have access not only to the content of a line but also to its offset.

	tie @lines, 'Array::LineReader', 'filename', result=>{};
	print $lines[8]->{OFFSET};	# Offset of the 9th line.
	print $lines[8]->{CONTENT};	# Content of the 9th line.

or to get the offset and content by reference to an array:

	tie @lines, 'Array::LineReader', 'filename', result=>[];
	print $lines[8]->[0];	# Offset of the 9th line.
	print $lines[8]->[1];	# Content of the 9th line.

=back

=cut

# Returns the element at $index of the tied array.
#
# An index beyond the lines indexed so far makes the file being read up to
# that line; the index is then set to the last known offset entry.
# Depending on the type of the "result" option the line is returned as
# plain content, as {OFFSET=>..., CONTENT=>...} or as [offset, content].
sub FETCH{
	my ($self,$index) = @_;
	my $lastKnown = $#{$self->{OFFSETS}};
	if ($index > $lastKnown){
		$self->_readUpTo($index);
		$index = $#{$self->{OFFSETS}};
	}
	my $content = $self->_readIt($index);
	my $kind = ref $self->{PARMS}->{result};
	if ($kind =~ /^HASH$/){
		return {OFFSET=>$self->{OFFSETS}->[$index],CONTENT=>$content};
	}
	if ($kind =~ /^ARRAY$/){
		return [$self->{OFFSETS}->[$index],$content];
	}
	return $content;
}

=head2 DESTROY

=over 4

=item DESTROY 

Overwrites the method C<DESTROY> of C<Tie::Array> (see L<Tie::Array>).
Closes the file to free some memory.

=back

=cut

# Closes the underlying file handle, if there is one, when the object goes away.
sub DESTROY{
	my ($self) = @_;
	my $fh = $self->{FH};
	close $fh if $fh;
}

=head2 EXISTS

=over 4

=item EXISTS

Overwrites the method C<EXISTS> of C<Tie::Array> (see L<Tie::Array>).
Returns true if the given index refers to a line of the file, whether or not that line was already read.

=back

=cut

# Tells whether $index lies within the tied array, i.e. is not negative and
# smaller than the size reported by FETCHSIZE.
sub EXISTS{
	my ($self, $index) = @_;
	my $nonNegative = ($index >= 0);
	# A negative index never exists; the size is not even asked for.
	return $nonNegative unless $nonNegative;
	return $index < $self->FETCHSIZE();
}

# The tied array is read-only: every modifying operation of the tied-array
# interface is routed to _readOnly, which croaks.
sub _readOnly{ croak "The tied array is readonly and can not be modified in any way\n"; }
*STORE = \&_readOnly;	# was misspelled as _readonly, an undefined subroutine
*STORESIZE = \&_readOnly;
*CLEAR = \&_readOnly;
*POP = \&_readOnly;
*PUSH = \&_readOnly;
*SHIFT = \&_readOnly;
*UNSHIFT = \&_readOnly;
*DELETE = \&_readOnly;
*EXTEND = \&_readOnly;
	
# Reads the line with the given index and returns its content (undef if
# there is no file handle or nothing could be read).
#
# $index must not exceed the last entry of OFFSETS. That last entry marks
# the start of the first line not yet indexed: reading it detects the end
# of the line, stores its length and appends the offset of the following
# line. Any smaller index is read using the stored offset and length.
sub _readIt{
	my ($self,$index) = @_;
	my $lastKnown = $#{$self->{OFFSETS}};
	die "Invalid call of privat method _readIt" if $index > $lastKnown;
	my $fh = $self->{FH};
	return undef unless $fh;
	seek($fh, $self->{OFFSETS}->[$index], 0);
	my $content = undef;
	if ($index != $lastKnown){
		# Line already indexed: read exactly its known length.
		$content = $self->_readLine($self->{LENGTHS}->[$index]);
		return $content;
	}
	# Line not indexed yet: look for its end unless EOF was reached before.
	$content = $self->_readLine() unless $self->{EOF};
	if (defined $content){
		$self->{LENGTHS}->[-1] = length($content);
		push(@{$self->{OFFSETS}},tell($fh));
		push(@{$self->{LENGTHS}},0);
	}
	return $content;
}

# Indexes the lines from the first one not yet known up to, but not
# including, the line with the given index. Stops early at EOF.
# Does nothing without a file handle or when EOF was already reached;
# dies if $index is not beyond the offsets known so far.
sub _readUpTo{
	my ($self,$index) = @_;
	return if !$self->{FH} || $self->{EOF};
	my $idx = $#{$self->{OFFSETS}};
	die "Invalid call of privat method _readUpTo" if $index <= $idx;
	while ($idx < $index){
		$self->_readIt($idx);
		last if $self->{EOF};
		$idx++;
	}
}

# Reads one line from the current position of the file handle.
#
# With a length given (and not 0) exactly that many bytes are read and
# returned; if that read yields nothing, the line end is searched instead.
# Without a length, bytes are collected up to and including the EOL
# sequence: a single "\x0A" or "\x0D", or one directly followed by the
# other. Sets EOF once the end of the file is seen.
# Returns the line, or undef if nothing is left to read.
sub _readLine{
	my ($self, $length) = @_;
	$length ||= 0;
	my $fh = $self->{FH};
	my $line = "";
	if ($length){
		return $line if $fh->read($line, $length);	# read $length byte if requested
	}

	# Collect everything in front of the first EOL character.
	my $c;
	while (1){
		$c = $fh->getc;
		last if !defined $c;
		last if $c =~ /^[\x0A\x0D]$/;
		$line .= $c;
	}
	unless (defined $c){
		$self->{EOF} = 1;
		return length($line) ? $line : $c;
	}
	$line .= $c;

	# Look one character ahead: only the other EOL character belongs to this
	# line, anything else is pushed back for the next line.
	my $nl = $fh->getc;
	if (!defined $nl){
		$self->{EOF} = 1;
	}elsif ($nl ne $c && $nl =~ /^[\x0A\x0D]$/){
		$line .= $nl;
	}else{
		$fh->ungetc(ord($nl));
	}
	return $line;
}
	
	
1;
__END__

=head1 EXPORT

None by default.

=head1 SEE ALSO

L<Array::FileReader>, L<Tie::Array>, L<Tie::File>

=head1 HISTORY

 * $Log: LineReader.pm,v $
 * Revision 1.1  2004/06/10 15:07:37  Bjoern_Holsten
 * First stable (as seems) version
 *

=head1 AUTHOR

Bjoern Holsten E<lt>bholsten + At + cpan + DoT + orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Bjoern Holsten

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.6.0 or,
at your option, any later version of Perl 5 you may have available.

=cut