The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package POE::Filter::Line;

use strict;
use POE::Filter;

use vars qw($VERSION @ISA);
$VERSION = '1.364'; # NOTE - Should be #.### (three decimal places)
@ISA = qw(POE::Filter);

use Carp qw(carp croak);

sub DEBUG () { 0 }

sub FRAMING_BUFFER   () { 0 }
sub INPUT_REGEXP     () { 1 }
sub OUTPUT_LITERAL   () { 2 }
sub AUTODETECT_STATE () { 3 }
sub MAX_LENGTH       () { 4 }
sub MAX_BUFFER       () { 5 }
sub FIRST_UNUSED     () { 6 }  # First unused $self offset.

sub AUTO_STATE_DONE   () { 0x00 }
sub AUTO_STATE_FIRST  () { 0x01 }
sub AUTO_STATE_SECOND () { 0x02 }

use base 'Exporter';
our @EXPORT_OK = qw( FIRST_UNUSED );

#------------------------------------------------------------------------------

sub new {
  my $type = shift;

  croak "$type requires an even number of parameters" if @_ and @_ & 1;
  my %params = @_;

  croak "$type cannot have both Regexp and Literal line endings" if (
    defined $params{Regexp} and defined $params{Literal}
  );

  my ($input_regexp, $output_literal);
  my $autodetect = AUTO_STATE_DONE;

  # Literal newline for both incoming and outgoing.  Every other known
  # parameter conflicts with this one.
  if (defined $params{Literal}) {
    croak "A defined Literal must have a nonzero length"
      unless defined($params{Literal}) and length($params{Literal});
    $input_regexp   = quotemeta $params{Literal};
    $output_literal = $params{Literal};
    if (
      exists $params{InputLiteral} or # undef means something
      defined $params{InputRegexp} or
      defined $params{OutputLiteral}
    ) {
      croak "$type cannot have Literal with any other parameter";
    }
  }

  # Input and output are specified separately, then.
  else {

    # Input can be either a literal or a regexp.  The regexp may be
    # compiled or not; we don't rightly care at this point.
    if (exists $params{InputLiteral}) {
      $input_regexp = $params{InputLiteral};

      # InputLiteral is defined.  Turn it into a regexp and be done.
      # Otherwise we will autodetect it.
      if (defined($input_regexp) and length($input_regexp)) {
        $input_regexp = quotemeta $input_regexp;
      }
      else {
        $autodetect   = AUTO_STATE_FIRST;
        $input_regexp = '';
      }

      croak "$type cannot have both InputLiteral and InputRegexp"
        if defined $params{InputRegexp};
    }
    elsif (defined $params{InputRegexp}) {
      $input_regexp = $params{InputRegexp};
      croak "$type cannot have both InputLiteral and InputRegexp"
        if defined $params{InputLiteral};
    }
    else {
      $input_regexp = "(\\x0D\\x0A?|\\x0A\\x0D?)";
    }

    if (defined $params{OutputLiteral}) {
      $output_literal = $params{OutputLiteral};
    }
    else {
      $output_literal = "\x0D\x0A";
    }
  }

  my $max_buffer = $type->__param_max( MaxBuffer => 512*1024*1024, \%params );
  my $max_length = $type->__param_max( MaxLength => 64*1024*1024, \%params );
  croak "MaxBuffer is not large enough for MaxLength blocks"
        unless $max_buffer >= $max_length;

  delete @params{qw(Literal InputLiteral OutputLiteral InputRegexp MaxLength MaxBuffer)};
  carp("$type ignores unknown parameters: ", join(', ', sort keys %params))
    if scalar keys %params;

  my $self = bless [
    '',              # FRAMING_BUFFER
    $input_regexp,   # INPUT_REGEXP
    $output_literal, # OUTPUT_LITERAL
    $autodetect,     # AUTODETECT_STATE
    $max_length,     # MAX_LENGTH
    $max_buffer      # MAX_BUFFER
  ], $type;

  DEBUG and warn join ':', @$self;

  $self;
}


#------------------------------------------------------------------------------
# get() is inherited from POE::Filter.

#------------------------------------------------------------------------------
# 2001-07-27 RCC: Add get_one_start() and get_one() to correct filter
# changing and make input flow control possible.

sub get_one_start {
  my ($self, $stream) = @_;

  DEBUG and do {
    my $temp = join '', @$stream;
    $temp = unpack 'H*', $temp;
    warn "got some raw data: $temp\n";
  };

  $self->[FRAMING_BUFFER] .= join '', @$stream;
  die "Framing buffer exceeds the limit"
    if $self->[MAX_BUFFER] < length( $self->[FRAMING_BUFFER] );
}

# TODO There is a lot of code duplicated here.  What can be done?

sub get_one {
  my $self = shift;

  # Process as many newlines an we can find.
  LINE: while (1) {

    # Autodetect is done, or it never started.  Parse some buffer!
    unless ($self->[AUTODETECT_STATE]) {
      DEBUG and warn unpack 'H*', $self->[INPUT_REGEXP];
      last LINE
        unless $self->[FRAMING_BUFFER] =~ s/^(.*?)$self->[INPUT_REGEXP]//s;
      DEBUG and warn "got line: <<", unpack('H*', $1), ">>\n";
      my $line = $1;
      die "Next line exceeds maximum line length"
            if length( $line ) > $self->[MAX_LENGTH];

      return [ $line ];
    }

    # Waiting for the first line ending.  Look for a generic newline.
    if ($self->[AUTODETECT_STATE] & AUTO_STATE_FIRST) {
      last LINE
        unless $self->[FRAMING_BUFFER] =~ s/^(.*?)(\x0D\x0A?|\x0A\x0D?)//;

      my $line = $1;

      # The newline can be complete under two conditions.  First: If
      # it's two characters.  Second: If there's more data in the
      # framing buffer.  Loop around in case there are more lines.
      if ( (length($2) == 2) or
           (length $self->[FRAMING_BUFFER])
         ) {
        DEBUG and warn "detected complete newline after line: <<$1>>\n";
        $self->[INPUT_REGEXP] = $2;
        $self->[AUTODETECT_STATE] = AUTO_STATE_DONE;
      }

      # The regexp has matched a potential partial newline.  Save it,
      # and move to the next state.  There is no more data in the
      # framing buffer, so we're done.
      else {
        DEBUG and warn "detected suspicious newline after line: <<$1>>\n";
        $self->[INPUT_REGEXP] = $2;
        $self->[AUTODETECT_STATE] = AUTO_STATE_SECOND;
      }
      die "Next line exceeds maximum line length"
            if length( $line ) > $self->[MAX_LENGTH];

      return [ $line ];
    }

    # Waiting for the second line beginning.  Bail out if we don't
    # have anything in the framing buffer.
    if ($self->[AUTODETECT_STATE] & AUTO_STATE_SECOND) {
      return [ ] unless length $self->[FRAMING_BUFFER];

      # Test the first character to see if it completes the previous
      # potentially partial newline.
      if (
        substr($self->[FRAMING_BUFFER], 0, 1) eq
        ( $self->[INPUT_REGEXP] eq "\x0D" ? "\x0A" : "\x0D" )
      ) {

        # Combine the first character with the previous newline, and
        # discard the newline from the buffer.  This is two statements
        # for backward compatibility.
        DEBUG and warn "completed newline after line: <<$1>>\n";
        $self->[INPUT_REGEXP] .= substr($self->[FRAMING_BUFFER], 0, 1);
        substr($self->[FRAMING_BUFFER], 0, 1) = '';
      }
      elsif (DEBUG) {
        warn "decided prior suspicious newline is okay\n";
      }

      # Regardless, whatever is in INPUT_REGEXP is now a complete
      # newline.  End autodetection, post-process the found newline,
      # and loop to see if there are other lines in the buffer.
      $self->[INPUT_REGEXP] = $self->[INPUT_REGEXP];
      $self->[AUTODETECT_STATE] = AUTO_STATE_DONE;
      next LINE;
    }

    die "consistency error: AUTODETECT_STATE = $self->[AUTODETECT_STATE]";
  }

  return [ ];
}

#------------------------------------------------------------------------------
# New behavior.  First translate system newlines ("\n") into whichever
# newlines are supposed to be sent.  Second, add a trailing newline if
# one doesn't already exist.  Since the referenced output list is
# supposed to contain one line per element, we also do a split and
# join.  Bleah. ... why isn't the code doing what the comment says?

sub put {
  my ($self, $lines) = @_;

  my @raw;
  foreach (@$lines) {
    push @raw, $_ . $self->[OUTPUT_LITERAL];
  }

  \@raw;
}

#------------------------------------------------------------------------------

sub get_pending {
  my $self = shift;
  return [ $self->[FRAMING_BUFFER] ] if length $self->[FRAMING_BUFFER];
  return undef;
}

1;

__END__

=head1 NAME

POE::Filter::Line - serialize and parse terminated records (lines)

=head1 SYNOPSIS

  #!perl

  use POE qw(Wheel::FollowTail Filter::Line);

  POE::Session->create(
    inline_states => {
      _start => sub {
        $_[HEAP]{tailor} = POE::Wheel::FollowTail->new(
          Filename => "/var/log/system.log",
          InputEvent => "got_log_line",
          Filter => POE::Filter::Line->new(),
        );
      },
      got_log_line => sub {
        print "Log: $_[ARG0]\n";
      }
    }
  );

  POE::Kernel->run();
  exit;

=head1 DESCRIPTION

POE::Filter::Line parses stream data into terminated records.  The
default parser interprets newlines as the record terminator, and the
default serializer appends network newlines (CR/LF, or "\x0D\x0A") to
outbound records.

Record terminators are removed from the data POE::Filter::Line
returns.

POE::Filter::Line supports a number of other ways to parse lines.
Constructor parameters may specify literal newlines, regular
expressions, or that the filter should detect newlines on its own.

=head1 PUBLIC FILTER METHODS

POE::Filter::Line's new() method has some interesting parameters.

=head2 new

new() accepts a list of named parameters.

In all cases, the data interpreted as the record terminator is
stripped from the data POE::Filter::Line returns.

C<InputLiteral> may be used to parse records that are terminated by
some literal string.  For example, POE::Filter::Line may be used to
parse and emit C-style lines, which are terminated with an ASCII NUL:

  my $c_line_filter = POE::Filter::Line->new(
    InputLiteral => chr(0),
    OutputLiteral => chr(0),
  );

C<OutputLiteral> allows a filter to put() records with a different
record terminator than it parses.  This can be useful in applications
that must translate record terminators.

C<Literal> is a shorthand for the common case where the input and
output literals are identical.  The previous example may be written
as:

  my $c_line_filter = POE::Filter::Line->new(
    Literal => chr(0),
  );

An application can also allow POE::Filter::Line to figure out which
newline to use.  This is done by specifying C<InputLiteral> to be
undef:

  my $whichever_line_filter = POE::Filter::Line->new(
    InputLiteral => undef,
    OutputLiteral => "\n",
  );

C<InputRegexp> may be used in place of C<InputLiteral> to recognize
line terminators based on a regular expression.  In this example,
input is terminated by two or more consecutive newlines.  On output,
the paragraph separator is "---" on a line by itself.

  my $paragraph_filter = POE::Filter::Line->new(
    InputRegexp => "([\x0D\x0A]{2,})",
    OutputLiteral => "\n---\n",
  );

C<MaxBuffer> sets the maximum amount of data that the filter will hold onto 
while trying to find a line ending.  Defaults to 512 MB.

C<MaxLength> sets the maximum length of a line.  Defaults to 64 MB.

If either the C<MaxLength> or C<MaxBuffer> constraint is exceeded,
C<POE::Filter::Line> will throw an exception.


=head1 PUBLIC FILTER METHODS

POE::Filter::Line has no additional public methods.

=head1 SUBCLASSING

POE::Filter::Line exports the FIRST_UNUSED constant.  This points to
the first unused element in the $self array reference.  Subclasses
should store their own data beginning here, and they should export
their own FIRST_UNUSED constants to help future subclassers.

=head1 SEE ALSO

Please see L<POE::Filter> for documentation regarding the base
interface.

The SEE ALSO section in L<POE> contains a table of contents covering
the entire POE distribution.

=head1 BUGS

The default input newline parser is a regexp that has an unfortunate
race condition.  First the regular expression:

  /(\x0D\x0A?|\x0A\x0D?)/

While it quickly recognizes most forms of newline, it can sometimes
detect an extra blank line.  This happens when a two-byte newline
character is broken between two reads.  Consider this situation:

  some stream dataCR
  LFother stream data

The regular expression will see the first CR without its corresponding
LF.  The filter will properly return "some stream data" as a line.
When the next packet arrives, the leading "LF" will be treated as the
terminator for a 0-byte line.  The filter will faithfully return this
empty line.

B<It is advised to specify literal newlines or use the autodetect
feature in applications where blank lines are significant.>

=head1 AUTHORS & COPYRIGHTS

Please see L<POE> for more information about authors and contributors.

=cut

# rocco // vim: ts=2 sw=2 expandtab
# TODO - Edit.