The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package PICA::Writer;

=head1 NAME

PICA::Writer - Write and count PICA+ records and fields

=cut

use strict;
our $VERSION = "0.55";

=head1 DESCRIPTION

This module contains a simple class to write PICA+ records and fields.
Several output targets (file, GLOB, L<IO:Handle>, string, null) and formats 
(XML, plain, normalized) are supported. The number of written records
and fields is counted so you can also use the class as a simple counter.
Additional statistics of fields and subfields can also be enabled.

=head1 SYNOPSIS

  $writer = PICA::Writer->new( \*STDOUT );
  $writer = PICA::Writer->new( "output.pica" );
  $writer = PICA::Writer->new( \$string, format => 'xml' );
  $writer = PICA::Writer->new( ); # no output

  $writer->start();  # called implicitely by default

  $writer->write( $record );
  $writer->write( @records );
  $writer->write( $field1, $field2, $field3 );
  $writer->write( $comment, $record );

  $writer->output( "output.xml" );  
  $writer->output( \*STDOUT, format => 'plain' );  
  print "Failed to open writer" unless $writer; # PICA::Writer::ERROR == 0

  print $writer->counter() . " records written\n";
  print $writer->fields()  . " fields written\n";

  $writer->reset();  # reset counters

  print $writer->status() == PICA::Writer::ENDED ? "open" : "ended";
 
  $writer->end(); # essential to close end tags in XML and such

  use PICA::Record qw(writerecord);
  writerecord( $record, $file );

=cut

use PICA::Record;
use XML::Writer;
use PICA::Parser;
use IO::Handle;
use IO::Scalar;
use IO::File;
use String::Escape qw(qqbackslash elide);
use Carp qw(croak);

use constant ERROR   => 0;
use constant NEW     => 1;
use constant STARTED => 2;
use constant ENDED   => 3;

use overload 
    'bool' => sub { $_[0]->status };

=head1 METHODS

=head2 new ( [ $output ] [ format => $format ] [ %options ] )

Create a new writer. See the C<output> method for possible parameters. 
The status of the new writer is set to C<PICA::Writer::NEW> (1) or
C<PICA::Writer::ERROR> (0). Boolean conversion is overloaded to return
the status so you can easily check whether a writer is in error status.

The writer can also be used for statistics if you set the 'stats' option.
With stats = 1 statistics is created on field level and with stats = 2
also on subfield level.

=cut

sub new {
    my $class = shift;
    my $self = bless {
        status => NEW,
        io => undef,
        options => {},
        recordcounter => 0,
        fieldcounter => 0,
    }, $class;
    return $self->reset( @_ ? @_ : undef );
}

=head2 output ( [ $output ] [ format => $format ] [ %options ] )

Define the output handler for this writer. Record and field counters are
not reset but the writer is ended with the C<end> method if it had been 
started before. The output handler can be a filename, a GLOB, an
L<IO:Handle> object, a string reference, or C<undef>. In addition you
can specify the output format with the C<format> parameter (C<plain> or
C<xml>) and some options depending on the format, for instance 'pretty =E<gt> 1'
and 'stats =E<gt> 0|1|2'.

The status of the writer is set to C<PICA::Writer::NEW> or C<PICA::Writer::ERROR>.
This methods returns the writer itself which boolean conversion is overloaded to
return the status so you can easily check the return value whether an error occurred.

=cut

sub output {
    my $self = shift;
    my ($output, %options) = @_ % 2 ? @_ : (undef, @_);

    %{ $self->{options} } = %options;
    my $format = $self->{options}->{format};

    if (not defined $output) {
        $self->{io} = undef;
    } elsif ( ref($output) eq 'GLOB' ) {
        $self->{io} = $output;
        PICA::Parser::enable_binmode_encoding( $self->{io} );
    } elsif ( UNIVERSAL::isa('IO::Handle', $output) ) {
        $self->{io} = $output;
        PICA::Parser::enable_binmode_encoding( $self->{io} );
    } elsif ( ref($output) eq 'SCALAR' ) {
        $self->{io} = IO::Scalar->new( $output );
    } else {
        $self->{io} = IO::File->new($output, '>:utf8');
        $format = 'xml' if not defined $format and $output =~ /\.xml$/;
    }

    if ($options{pretty}) {
        $options{DATA_MODE} = 1;
        $options{DATA_INDENT} = 2;
        $options{NAMESPACES} = 1;
        $options{PREFIX_MAP} =  {'info:srw/schema/5/picaXML-v1.0'=>''};
    }

    $format = 'plain' unless defined $format and $format =~ /^(plain|normalized|xml)$/i;

    $self->{options}->{format} = lc($format);
    if ( $format =~ /^xml$/i and defined $output ) {
        $options{OUTPUT} = $self->{io};
        $options{header} = 1 unless defined $options{header};
        $self->{xmlwriter} = PICA::Writer::xmlwriter( %options );
    } else {
        $self->{xmlwriter} = undef;
    }
    
    if (defined $output and not $self->{io}) {
        $self->{status} = ERROR;
    }

    if ( $self->{options}->{stats} ) {
        $self->{fieldstat} = {};
        $self->{subfieldstat} = {} if $self->{options}->{stats} > 1;
    } else {
        $self->{subfieldstat} = undef;
        $self->{fieldstat} = undef;
    }

    return $self;
}

=head2 reset ( [ $output ] )

Reset the writer by setting record and field counters to zero and returning
the writer object. Optionally you can define a new output handler, so the 
following two lines are equal:

  $writer->output( $output )->reset();
  $writer->reset( $output );

The status of the writer will only be changed if you specify a new output handler.

=cut

sub reset {
    my $self = shift;
    $self->output( @_ ) if @_;

    $self->{recordcounter} = 0;
    $self->{fieldcounter} = 0;
    
    return $self;
}

=head2 write ( [ $comment | $record | $field ]* )

Write L<PICA::Field>, L<PICA::Record> objects, and comments (as strings)
and record the writer object. The number of written records and fields is
counted and can be queried with methods counter and fields.

  $writer->write( $record );
  $writer->write( @records );
  $writer->write( "record number " . $writer->counter(), $record );
  $writer->write( $field1, $field2 );

Writing single fields or mixing records and fields may not be possible 
depending on the output format and output handler. 

Returns the writer object so you can chain calls:

  $writer->write( $r1 )->write( $r2 )->end;

=cut

sub write {
    my $self = shift;
    croak('cannot write to a closed writer') if $self->status == ENDED;
    $self->start if $self->status != STARTED;

    my $format = $self->{options}->{format};

    if (UNIVERSAL::isa($_[0],'PICA::Field')) {
        while (@_) {
            my $field = shift;
            if (UNIVERSAL::isa($field,'PICA::Field')) {
                if ($format eq 'plain') {
                    print { $self->{io} } $field->string if $self->{io};
                } elsif ($format eq 'normalized') {
                    print { $self->{io} } $field->normalized() if $self->{io};
                } elsif ($format eq 'xml' and defined $self->{xmlwriter} ) {
                    $field->xml( $self->{xmlwriter} );
                }
                $self->addfieldstat( $field );
            } else {
                croak("Cannot write object of unknown type (PICA::Field expected)!");
            }
        }
    } else {
        my $comment = "";
        while (@_) {
            my $record = shift;
            if ( UNIVERSAL::isa($record, 'PICA::Record') ) {
                if ($format eq 'plain') {
                    print { $self->{io} } "\n"
                        if ($self->{recordcounter} > 0 && $self->{io});
                    print { $self->{io} } $record->string if $self->{io};
                } elsif ($format eq 'normalized') {
                    print { $self->{io} }  "\x1D\x0A" # next record
                        if ($self->{recordcounter} > 0 && $self->{io});
                    print { $self->{io} } $record->normalized() if $self->{io};
                } elsif ($format eq 'xml' and defined $self->{xmlwriter} ) {
                    $record->xml( $self->{xmlwriter} );
                }
                $self->addrecordstat( $record );
            } elsif (ref(\$record) eq 'SCALAR') {
                next if !$record;
                $comment = '# ' . join("\n# ", split(/\n/,$record)) . "\n";
                $comment =~ s/--//g;
                if ($format eq 'xml') {
                    $self->{xmlwriter}->comment( $comment )
                        if defined $self->{xmlwriter};
                } else {
                    print { $self->{io} } $comment if $self->{io};
                }
            } else {
                croak("Cannot write object of unknown type (PICA::Record expected)!");
            }
        }
    }

    return $self;
}

=head2 start ( [ %options ] )

Start writing and return the writer object. Depending on the format and 
output handler a header is written. Afterwards the status is set to
PICA::Writer::STARTED. You can pass optional parameters depending on the
format.

  $writer->start( ); # default
  $writer->start( xslt => 'mystylesheet.xsl' );
  $writer->start( nsprefix => 'pica' );

This method is implicitely called the first time you write to a PICA::Writer
that is not in status PICA::Writer::STARTED..

=cut

sub start {
    my $self = shift;
    croak('cannot start a writer twice') if $self->status == STARTED;
    croak('cannot start a writer in error status') if $self->status == ERROR;

    my $writer = $self->{xmlwriter};
    if ( $self->{options}->{format} eq 'xml' and defined $writer ) {
        if (UNIVERSAL::isa( $writer, 'XML::Writer::Namespaces' )) {
            $writer->startTag( [$PICA::Record::XMLNAMESPACE, 'collection'] );
        } else {
            $writer->startTag( 'collection' );
        }
    }

    $self->{status} = STARTED;

    return $self;
}


=head2 end ( )

Finish writing. Depending on the format and output handler a footer is
written (for instance an XML end tag) and the output handler is closed. 
Afterwards the status is set to PICA::Writer::ENDED. If the writer had
not been started before, the start method is called first. 

Ending or writing to an already ended writer will throw an error. You can
restart an ended writer with the output method or with the start method.

=cut

sub end {
    my $self = shift;
    croak('cannot end a writer in error status') if $self->status == ERROR;
    croak('cannot end a writer twice') if $self->status == ENDED;
    $self->start if $self->status != STARTED;

    if ( $self->{options}->{format} eq 'xml') {
        if ( defined $self->{xmlwriter} ) {
            $self->{xmlwriter}->endTag(); # </collection>
            $self->{xmlwriter}->end(); 
        }
    } else {
        # other supported formats don't need end handling
    }

    $self->{io}->close if defined $self->{io};
    $self->{status} = ENDED;
 
    return $self;
}

=head2 status ( )

Return the status which can be PICA::Writer::NEW, PICA::Writer::STARTED, 
PICA::Writer::ENDED, or PICA::Writer::ERROR.

=cut

sub status {
    my $self = shift;
    return $self->{status};
}

=head2 records ( )

Returns the number of written records.

=cut

sub records {
    my $self = shift;
    return $self->{recordcounter};
}

=head2 counter ( )

Alias for records().

=cut

*counter = *records;

=head2 fields ( )

Returns the number of written fields.

=cut

sub fields {
    my $self = shift;
    return $self->{fieldcounter};
}

=head2 statlines ( )

Return a list of lines with statistics (if stats option had been set).

=cut

sub statlines {
    my $self = shift;

    my @STRINGS = ('?',' ','*','+');
    my @stats = ();

    my $fieldstat = $self->{fieldstat} || { };
    my $subfieldstat = $self->{subfieldstat};

    foreach my $tag (sort { $a cmp $b } keys %{$fieldstat}) {
        my $line = length($tag) < 5 ? "$tag    " : "$tag ";
        $line .= $STRINGS[ $fieldstat->{$tag} ];
        if ( defined $subfieldstat ) {
            my $s = $subfieldstat->{$tag};
            foreach (keys %{$s}) {
                $line .= "\$$_ ";
                $line .= $STRINGS[ $s->{$_}->{occ} ];
                $line .= qqbackslash(elide($s->{$_}->{val},40))
                    if defined $s->{$_}->{val};
                $line .= " "; # TODO: join!
            }
        }
        push @stats, $line;
    }

    return @stats;
}


=head1 FUNCTIONS

=head2 xmlwriter ( %params )

Create a new L<XML::Writer> instance and optionally write XML header
and processing instruction. Relevant parameters include 'header' (boolean),
'xslt', NAMESPACES, PREFIX_MAP.

=cut

sub xmlwriter {
    my %params = @_;

    $params{NAMESPACES} = 1 unless defined $params{NAMESPACES};
    if (not defined $params{PREFIX_MAP} or 
        not defined $params{PREFIX_MAP}->{ $PICA::Record::XMLNAMESPACE }) {
        $params{PREFIX_MAP} = { $PICA::Record::XMLNAMESPACE => 'pica' };
    }
    my $writer = XML::Writer->new( %params );
    $writer->xmlDecl('UTF-8') if $params{header};
    if ($params{xslt}) {
        $writer->pi('xml-stylesheet', 'type="text/xsl" href="' . $params{xslt} . '"');
    }

    return $writer;
}

=head2 PRIVATE METHDOS

=head2 addfieldstat ( $field )

Add a field to the statistics.

=cut

sub addfieldstat {
    my ($self, $field) = @_;
    $self->{fieldcounter}++;

    return unless defined $self->{subfieldstat};

    my $tag = $field->tag;
    my (%o,%v);

    my @content = $field->content;
      #print Dumper($field->content);
    foreach (@content) {
        my ($sf,$value) = @{$_};

        $o{ $sf }++;
        if ( exists $v{ $sf } ) {
            $v{ $sf } = undef unless defined $v{ $sf } and $v{ $sf } eq $value;
        } else {
            $v{ $sf } = $value;
        }
    }

    my $sfstat = $self->{subfieldstat};

    # TODO: order of subfields
    my $all = $sfstat->{$tag};
    if ( $sfstat->{$tag} ) {
        foreach my $sf (keys %{$sfstat->{$tag}}) {
            my $cur = $sfstat->{$tag}->{$sf};
            if ( $o{$sf} ) { # this time also
                # ..
                $cur->{occ} += 2
                    if $o{$sf} > 1 and $cur->{occ} < 2;

                $cur->{val} = undef unless
                    defined $v{$sf} and defined $cur->{val} and $v{$sf} eq $cur->{val};
                delete $v{$sf};
                delete $o{$sf};
            } else { # not this time but before
                $cur->{occ} = $cur->{occ} > 1  ? 0 : 2; 
            }
        }

        # fehlende subfields hinzufügen
        foreach (keys %o) {
            $sfstat->{$tag}->{$_} = { val => $v{$_}, occ => $o{$_} };
        }
    } else {
        $sfstat->{$tag} = {
            map { $_ => { val => $v{$_}, occ => $o{$_} }  } keys %o
        };
    }
    
    # ...stats...
}

=head2 addrecordstat ( $record )

Add a record to the statistics.

=cut

sub addrecordstat {
    my ($self, $record) = @_;
    $self->{recordcounter}++;

    if ( not defined $self->{fieldstat} ) {
        $self->{fieldcounter} += scalar $record->fields;
        return;
    }
    my $fieldstat = $self->{fieldstat};
    
    # add field stats
    my %count; # undef, one, repeatable
    foreach my $field ($record->fields) {
        $self->addfieldstat( $field );
        $count{ $field->tag }++;
    }

    # 1, 3 : unique
    # 0,1,2,3 ?: optional, 1: mandatory, +: repeatable

    foreach my $tag (keys %{$fieldstat}) {
        if ( $count{$tag} ) { # does exist this time but before only once
            if ( $count{$tag} > 1 and $fieldstat->{$tag} < 2 ) {
                $fieldstat->{$tag} += 2;
            } #if $fieldstat->{$tag} < 2;
            delete $count{$tag};
        } else { # has existed before but not this time
            $fieldstat->{$tag} = $fieldstat->{$tag} > 1  ? 0 : 2; 
        }
    }

    # new fields are '1' or '+'
    foreach my $tag (keys %count) {
        $fieldstat->{$tag} = $count{$tag} > 1 ? 3 : 1; # 
    }
}

1;

=head1 AUTHOR

Jakob Voss C<< <jakob.voss@gbv.de> >>

=head1 LICENSE

Copyright (C) 2007-2009 by Verbundzentrale Goettingen (VZG) and Jakob Voss

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself, either Perl version 5.8.8 or, at
your option, any later version of Perl 5 you may have available.