# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package AI::Categorizer::Collection::SingleFile;
use strict;

use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);

use Params::Validate qw(:types);

# Constructor parameters, declared with Params::Validate type constants:
#   path       - one file name, or a reference to an array of file names
#   categories - hash reference or undef; defaults to undef
#   delimiter  - record separator; assigned to $/ when documents are read
__PACKAGE__->valid_params(
  path       => { type => SCALAR | ARRAYREF },
  categories => { type => HASHREF | UNDEF, default => undef },
  delimiter  => { type => SCALAR },
);

# Document objects are declared as "delayed": they are not built with the
# collection itself but on demand, through create_delayed_object('document').
__PACKAGE__->contained_objects(
  document => {
    class   => 'AI::Categorizer::Document::Text',
    delayed => 1,
  },
);

# Constructor.  All arguments are passed unchanged to the parent class
# constructor; afterwards the first file listed in 'path' is opened so the
# object is immediately ready for next().
#
# Returns the new object.  Dies (inside _next_path) if the first file
# cannot be opened.
sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  $self->{fh} = do {local *FH; *FH};  # double *FH avoids a warning

  # Documents are contained in a file, or list of files.  Keep a private
  # copy of the list: _next_path shifts entries off it, which would
  # otherwise empty an array reference supplied by the caller.
  $self->{path} = [ ref $self->{path} ? @{$self->{path}} : $self->{path} ];
  $self->{used} = [];  # files already opened, in the order they were read

  $self->_next_path;
  return $self;
}

# Advances to the next pending file: closes the current handle if a file
# is open, moves the first entry of 'path' onto the end of 'used', records
# it as 'cur_file', and opens it for reading on $self->{fh}.
#
# Dies if no file is pending or if the file cannot be opened.
sub _next_path {
  my $self = shift;
  close $self->{fh} if $self->{cur_file} && defined $self->{fh};

  # Fail before touching 'used' so the bookkeeping never contains undef.
  die "No more files to read" unless @{$self->{path}};

  my $file = shift @{$self->{path}};
  push @{$self->{used}}, $file;
  $self->{cur_file} = $file;

  # Three-argument open takes the name literally, so leading/trailing
  # whitespace or mode characters in a file name are not interpreted.
  open $self->{fh}, '<', $file or die "$file: $!";
}

# Returns the next document in the collection, or undef once every file
# has been read to the end.
#
# A document is one record read from the current file with $/ set to the
# configured delimiter.  Records consisting only of optional whitespace
# followed by the delimiter are skipped.  When a file runs out, reading
# continues with the next pending file (via _next_path).
sub next {
  my $self = shift;

  # Loop rather than recurse, so long runs of empty records or exhausted
  # files do not grow the call stack.
  while (1) {
    my $fh = $self->{fh}; # Must put in a simple scalar
    my $content = do {local $/ = $self->{delimiter}; <$fh>};

    if (!defined $content) { # File has been exhausted
      if (@{$self->{path}}) { # More files are waiting
        $self->_next_path;
        next;
      }

      # All files have been exhausted.  Close the handle but keep it
      # defined, and forget the current file, so that a later rewind()
      # or next() does not operate on an undefined handle (a fatal error
      # under strict).
      close $fh if defined $fh;
      $self->{cur_file} = undef;
      return undef;
    }

    # Skip empty docs.  The delimiter is quoted because $/ treats it as a
    # literal string, not as a pattern.
    next if $content =~ /^\s*\Q$self->{delimiter}\E$/;

    return $self->create_delayed_object('document', content => $content);
  }
}

# Returns the number of records in all files of the collection.
#
# The result is cached in $self->{document_count}; later calls return the
# cached value without touching the files.  Counting rewinds the
# collection before and after, so any iteration in progress is reset.
#
# Note: every record read with the delimiter is counted, including records
# that next() would skip as empty.
sub count_documents {
  my ($self) = @_;
  return $self->{document_count} if defined $self->{document_count};
  
  $self->rewind;

  my $count = 0;
  local $/ = $self->{delimiter};
  while (1) {
    # Fetch the handle again for each file in case _next_path replaced it.
    my $fh = $self->{fh};

    # Read into a lexical: a bare "while <$fh>" assigns to the global $_
    # and would clobber the caller's value.
    while (defined(my $record = <$fh>)) {
      $count++;
    }

    last unless @{$self->{path}};
    $self->_next_path;
  }
  
  $self->rewind;

  return $self->{document_count} = $count;
}

# Restarts iteration from the first file: closes the current file if one
# is open, puts every already-used file back at the front of 'path' in its
# original order, and opens the first one again via _next_path.
#
# Returns whatever _next_path returns; dies if that file cannot be opened.
sub rewind {
  my ($self) = @_;

  if ($self->{cur_file}) {
    # The handle may have been dropped once the collection was exhausted;
    # closing an undefined handle is fatal under strict, so check first.
    close $self->{fh} if defined $self->{fh};

    # Mark the file as closed so _next_path does not close it a second time.
    $self->{cur_file} = undef;
  }

  unshift @{$self->{path}}, @{$self->{used}};
  $self->{used} = [];
  $self->_next_path;
}

1;