package AI::Categorizer::Collection::SingleFile;
use strict;
use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);
use Params::Validate qw(:types);
__PACKAGE__->valid_params
(
path => { type => SCALAR|ARRAYREF },
categories => { type => HASHREF|UNDEF, default => undef },
delimiter => { type => SCALAR },
);
__PACKAGE__->contained_objects
(
document => { class => 'AI::Categorizer::Document::Text',
delayed => 1 },
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
$self->{fh} = do {local *FH; *FH}; # double *FH avoids a warning
# Documents are contained in a file, or list of files
$self->{path} = [$self->{path}] unless ref $self->{path};
$self->{used} = [];
$self->_next_path;
return $self;
}
sub _next_path {
my $self = shift;
close $self->{fh} if $self->{cur_file};
push @{$self->{used}}, shift @{$self->{path}};
$self->{cur_file} = $self->{used}[-1];
open $self->{fh}, "< $self->{cur_file}" or die "$self->{cur_file}: $!";
}
sub next {
my $self = shift;
my $fh = $self->{fh}; # Must put in a simple scalar
my $content = do {local $/ = $self->{delimiter}; <$fh>};
if (!defined $content) { # File has been exhausted
unless (@{$self->{path}}) { # All files have been exhausted
$self->{fh} = undef;
return undef;
}
$self->_next_path;
return $self->next;
} elsif ($content =~ /^\s*$self->{delimiter}$/) { # Skip empty docs
return $self->next;
}
#warn "doc is $content";
#warn "creating document=>@{[ %{$self->{container}{delayed}{document}} ]}";
return $self->create_delayed_object('document', content => $content);
}
sub count_documents {
my ($self) = @_;
return $self->{document_count} if defined $self->{document_count};
$self->rewind;
my $count = 0;
local $/ = $self->{delimiter};
my $fh = $self->{fh};
while (1) {
$count++ while <$fh>;
last unless @{$self->{path}};
$self->_next_path;
}
$self->rewind;
return $self->{document_count} = $count;
}
sub rewind {
my ($self) = @_;
close $self->{fh} if $self->{cur_file};
unshift @{$self->{path}}, @{$self->{used}};
$self->{used} = [];
$self->_next_path;
}
1;