The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package XML::Struct::Reader;
use strict;

use Moo;
use Carp qw(croak);
our @CARP_NOT = qw(XML::Struct);
use Scalar::Util qw(blessed);
use XML::Struct;

our $VERSION = '0.26';

has whitespace => (is => 'ro', default => sub { 0 });
has attributes => (is => 'ro', default => sub { 1 });
has path       => (is => 'ro', default => sub { '*' }, isa => \&_checkPath);
has stream     => (is => 'rw', 
    lazy    => 1,
    builder => 1,
    isa     => sub {
        die 'stream must be an XML::LibXML::Reader'
        unless blessed $_[0] && $_[0]->isa('XML::LibXML::Reader');
    }
);
has from       => (is => 'ro', trigger => 1);
has ns         => (is => 'ro', default => sub { 'keep' }, trigger => 1);
has depth      => (is => 'ro', coerce => sub {
    (defined $_[0] and $_[0] =~ /^\+?\d+/) ? $_[0] : undef
});
has deep       => (is => 'ro', default => sub { '' } );
has simple     => (is => 'ro', default => sub { 0 });
has root       => (is => 'ro', default => sub { 0 });
has content    => (is => 'ro', default => sub { 'content' });

use XML::LibXML::Reader qw(
    XML_READER_TYPE_ELEMENT
    XML_READER_TYPE_TEXT
    XML_READER_TYPE_CDATA
    XML_READER_TYPE_SIGNIFICANT_WHITESPACE
    XML_READER_TYPE_END_ELEMENT
); 

sub BUILD {
    my ($self) = @_;
    
    # make sure that option 'deep' and 'depth' are only set if it makes sense
    
    if ($self->deep eq 'simple') {
        if ($self->simple or (defined $self->depth and $self->depth == 0)) {
            # (deep = simple, simple = 1) or (deep = simple, depth = 0)
            $self->{simple} = 1;
            delete $self->{depth};
            $self->{deep} = '';
        }
    } elsif ($self->deep eq 'struct') {
        $self->{deep} = '';
    } elsif ($self->deep eq '') {
        $self->{deep} = $self->simple ? '' : 'simple';
    } elsif ($self->deep !~ /^(dom|raw)$/) {
        croak "option deep must be simple, struct, dom, or raw!"; 
    }

    if (($self->depth || 0) and $self->root and $self->simple) {
        $self->{depth} = $self->{depth}-1;
    }
}

sub _build_stream {
    XML::LibXML::Reader->new( { IO => \*STDIN } )
}
 
sub _trigger_from {
    my ($self, $from) = @_;

    unless (blessed $from and $from->isa('XML::LibXML::Reader')) {
        my %options; 

        if (ref $from and ref $from eq 'HASH') {
            %options = %$from;
            $from = delete $options{from} if exists $options{from};
        }

        if (!defined $from or $from eq '-') {
            $options{IO} = \*STDIN
        } elsif( !ref $from and $from =~ /^</ ) {
            $options{string} = $from;
        } elsif( ref $from and ref $from eq 'SCALAR' ) {
            $options{string} = $$from;
        } elsif( ref $from and ref $from eq 'GLOB' ) {
            $options{FD} = $from;
        } elsif( blessed $from and $from->isa('XML::LibXML::Document') ) {
            $options{DOM} = $from;
        } elsif( blessed $from and $from->isa('XML::LibXML::Element') ) {
            my $doc = XML::LibXML->createDocument;
            $doc->setDocumentElement($from);
            $options{DOM} = $doc;
        } elsif( blessed $from ) {
            $options{IO} = $from;
        } elsif( !ref $from ) {
            $options{location} = $from; # filename or URL
        } elsif( ! grep { $_ =~ /^(IO|string|location|FD|DOM)$/} keys %options ) {
            croak "invalid option 'from': $from";
        }
        
        $from = XML::LibXML::Reader->new( %options ) 
            or die "failed to create XML::LibXML::Reader with "
                . join(', ',map { "$_=".$options{$_} } keys %options )."\n";
    }

    $self->stream($from);
}


sub _trigger_ns {
    my ($self, $ns) = @_;

    if (!defined $ns or $ns eq '') {
        $self->{ns} = 'keep';
    } elsif ($ns !~ /^(keep|strip|disallow)?$/) {
        croak "invalid option 'ns': $ns";
    }
}


sub _checkPath {
    my $path = shift;

    die "invalid path: $path" if $path =~ qr{\.\.|.//|^\.};
    die "relative path not supported: $path" if $path =~ qr{^[^/]+/};

    return $path;
}

sub _nameMatch {
    return ($_[0] eq '*' or $_[0] eq $_[1]); 
}

# read to the next element
# TODO: use XML::LibXML->nextPatternMatch
sub _nextPatternMatch {
    my ($self, $stream, $path) = @_;

    $path =~ s{^//}{};
    $path .= '*' if $path =~ qr{^$|/$};

    my @parts = split '/', $path;
    my $relative = $parts[0] ne '';

    while(1) { 
        return if !$stream->read; # end or error
        next if $stream->nodeType != XML_READER_TYPE_ELEMENT;

#        printf " %d=%d %s:%s==%s\n", $stream->depth, scalar @parts, $stream->nodePath, $stream->name, join('/', @parts);

        my $name = $self->_name($stream);

        if ($relative) {
            if (_nameMatch($parts[0], $name)) {
                last;
            }
        } else {
            if (!_nameMatch($parts[$stream->depth+1], $name)) {
                $stream->nextSibling();
            } elsif ($stream->depth == scalar @parts - 2) {
                last;
            }
        }
    } 

    return 1;
}

sub readNext { 
    my $self   = shift;
    my $stream = blessed $_[0] ? shift() : $self->stream;
    my $path   = defined $_[0] ? _checkPath($_[0]) : $self->path;

    return unless $self->_nextPatternMatch($stream, $path);

    my $xml = $self->readElement($stream);

    return $self->simple ? XML::Struct::Simple->new(
            root        => $self->root, 
            attributes  => $self->attributes,
            depth       => $self->depth,
            content     => $self->content, 
        )->transform($xml) : $xml;
}

*read = \&readNext;


sub readDocument {
    my $self = shift;
    my @document;
   
    while(my $element = $self->read(@_)) {
        return $element unless wantarray;
        push @document, $element;
    }

    return @document;
}

sub _name {
    my ($self, $stream) = @_;

    if ($self->ns eq 'strip') {
        return $stream->localName;
    } elsif( $self->ns eq 'disallow' ) {
        if ( $stream->name =~ /^xmlns(:.*)?$/) {
            croak "namespaces not allowed at line ".$stream->lineNumber;
        }
    }

    return $stream->name;
}


sub readElement {
    my $self   = shift;
    my $stream = @_ ? shift : $self->stream;

    my @element = ($self->_name($stream));

    # TODO: dom or raw
    if (defined $self->depth and $stream->depth >= $self->depth) {
        if ($self->deep eq 'dom') {
            my $dom = $stream->copyCurrentNode(1);
            $stream->next;
            return $dom;
        } elsif ($self->deep eq 'raw') {
            my $xml = $stream->readOuterXml();
            $stream->next;
            return $xml;
        }
        #copyCurrentNode
        #if (defined $self->depth and $self->depth == $stream->depth ) {
        #print $stream->depth." ".$self->deep."!".$element[0]."\n";
        #}
    }

    if ($self->attributes) {
        my $attr = $self->readAttributes($stream);
        my $children = $stream->isEmptyElement ? [ ] : $self->readContent($stream);
        push @element, $attr, $children;
    } elsif( !$stream->isEmptyElement ) {
        push @element, $self->readContent($stream);
    }

    return \@element;
}


sub readAttributes {
    my $self   = shift;
    my $stream = @_ ? shift : $self->stream;

    return { } if $stream->moveToFirstAttribute != 1;

    my $attr = { };
    do {
        if ($self->ns ne 'strip' or $stream->name !~ /^xmlns(:.*)?$/) {
            $attr->{ $self->_name($stream) } = $stream->value;
        }
    } while ($stream->moveToNextAttribute);
    $stream->moveToElement;

    return $attr;
}


sub readContent {
    my $self   = shift;
    my $stream = @_ ? shift : $self->stream;

    my @children;
    while(1) {
        $stream->read;
        my $type = $stream->nodeType;

        last if !$type or $type == XML_READER_TYPE_END_ELEMENT;

        if ($type == XML_READER_TYPE_ELEMENT) {
            push @children, $self->readElement($stream);
        } elsif ($type == XML_READER_TYPE_TEXT or $type == XML_READER_TYPE_CDATA ) {
            push @children, $stream->value;
        } elsif ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE && $self->whitespace) {
            push @children, $stream->value;
        }
    }
    
    return \@children; 
}

1;
__END__

=encoding UTF-8

=head1 NAME

XML::Struct::Reader - Read XML streams into XML data structures

=head1 SYNOPSIS

    my $reader = XML::Struct::Reader->new( from => "file.xml" );
    my $data   = $reader->read;

=head1 DESCRIPTION

This module reads an XML stream (via L<XML::LibXML::Reader>) into
L<XML::Struct>/MicroXML data structures.

=head1 METHODS

=head2 read = readNext ( [ $stream ] [, $path ] )

Read the next XML element from a stream. If no path option is specified, the
reader's path option is used ("C<*>" by default, first matching the root, then
every other element). 

=head2 readDocument( [ $stream ] [, $path ] )

Read an entire XML document. In contrast to C<read>/C<readNext>, this method
always reads the entire stream. The return value is the first element (that is
the root element by default) in scalar context and a list of elements in array
context. Multiple elements can be returned for instance when a path was
specified to select document fragments.

=head2 readElement( [ $stream ] )

Read an XML element from a stream and return it as array reference with element name,
attributes, and child elements. In contrast to method C<read>, this method expects
the stream to be at an element node (C<< $stream->nodeType == 1 >>) or bad things
might happed.

=head2 readAttributes( [ $stream ] )

Read all XML attributes from a stream and return a (possibly empty) hash
reference.

=head2 readContent( [ $stream ] )

Read all child elements of an XML element and return the result as (possibly
empty) array reference.  Significant whitespace is only included if option
C<whitespace> is enabled.

=head1 CONFIGURATION

=over

=item from

A source to read from. Possible values include a string or string reference
with XML data, a filename, an URL, a file handle, instances of
L<XML::LibXML::Document> or L<XML::LibXML::Element>, and a hash reference with
options passed to L<XML::LibXML::Reader>.

=item stream

A L<XML::LibXML::Reader> to read from. If no stream has been defined, one must
pass a stream parameter to the C<read...> methods. Setting a source with option
C<from> automatically sets a stream.

=item attributes

Include attributes (enabled by default). If disabled, the representation of
an XML element will be

   [ $name => \@children ]

instead of

   [ $name => \%attributes, \@children ]

=item path

Optional path expression to be used as default value when calling C<read>.
Pathes must either be absolute (starting with "C</>") or consist of a single
element name. The special name "C<*>" matches all element names.

A path is a very reduced form of an XPath expressions (no axes, no "C<..>", no
node tests, C<//> only at the start...).  Namespaces are not supported yet.

=item whitespace

Include ignorable whitespace as text elements (disabled by default)

=item ns

Define how XML namespaces should be processed. By default (value 'C<keep>'),
this document:

    <doc>
      <x:foo xmlns:x="http://example.org/" bar="doz" />
    </doc>

is transformed to this structure, keeping namespace prefixes and declarations 
as unprocessed element names and attributes:

    [ 'doc', {}, [
        [
          'x:foo', {
              'bar' => 'doz',
              'xmlns:x' => 'http://example.org/'
          }
        ]
    ]

Setting this option to 'C<strip>' will remove all namespace prefixes and
namespace declaration attributes, so the result would be:

    [ 'doc', {}, [
        [
          'foo', {
              'bar' => 'doz'
          }
        ]
    ]

Setting this option to 'C<disallow>' results in an error when namespace
prefixes or declarations are read.

Expanding namespace URIs ('C<expand'>) is not supported yet.

=item simple

Convert XML to simple key-value structure (SimpleXML) with
L<XML::Struct::Simple>.

=item depth

Only transform to a given depth, starting at C<0> for the root node. Negative
values, non-numeric values or C<undef> are ignored (unlimited depth as
default).

XML elements below the depth are converted to SimpleXML by default or to
MicroXML if option C<simple> is enabled. This can be configured with option
C<deep>.

This option is useful for instance to access document-oriented XML embedded in
data oriented XML. 

=item deep

How to transform elements below given C<depth>. This option is experimental.

=item root

Include root element when converting to SimpleXML. Disabled by default.

=item content

Name of text content when converting to SimpleXML.

=back

=cut