The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package JKML;
use 5.008005;
use strict;
use warnings;
use parent qw(Exporter);
use Encode ();
use MIME::Base64 ();
use Types::Serialiser;
use B ();
use Scalar::Util ();

our @EXPORT = qw(decode_jkml encode_jkml);

our $VERSION = "0.01";

our @HERE_QUEUE;
our $SELF;

# JKML is based on JSON::Tiny.
# JSON::Tiny was "Adapted from Mojo::JSON and Mojo::Util".

# Licensed under the Artistic 2.0 license.
# http://www.perlfoundation.org/artistic_license_2_0.

sub new {
  my $class = shift;
  bless {
    functions => {
      base64 => \&MIME::Base64::decode_base64,
    }
  }, $class;
}

sub call {
  my ($self, $name, $vref) = @_;
  my $code = $self->{functions}->{$name};
  unless ($code) {
    _exception("Unknown function: $name");
  }
  $code->($vref);
}

my $TRUE  = $Types::Serialiser::true;
my $FALSE = $Types::Serialiser::false;

# Escaped special character map (with u2028 and u2029)
my %ESCAPE = (
  '"'     => '"',
  '\\'    => '\\',
  '/'     => '/',
  'b'     => "\x08",
  'f'     => "\x0c",
  'n'     => "\x0a",
  'r'     => "\x0d",
  't'     => "\x09",
  'u2028' => "\x{2028}",
  'u2029' => "\x{2029}"
);
my %REVERSE = map { $ESCAPE{$_} => "\\$_" } keys %ESCAPE;

for( 0x00 .. 0x1f, 0x7f ) {
  my $packed = pack 'C', $_;
  $REVERSE{$packed} = sprintf '\u%.4X', $_
    if ! defined( $REVERSE{$packed} );
}

my $WHITESPACE_RE = qr/[\x20\x09]/;
my $COMMENT_RE = qr!#[^\n]*(?:\n|\z)!;
my $IGNORABLE_RE = qr!(?:$WHITESPACE_RE|$COMMENT_RE)*!;
my $LEFTOVER_RE = qr!(?:$WHITESPACE_RE|$COMMENT_RE|[\x0d\x0a])*!;

sub decode_jkml { JKML->new->decode(shift) }
sub encode_jkml { JKML->new->encode(shift) }


sub decode {
  my ($self, $bytes) = @_;

  local $SELF = $self;
  local @HERE_QUEUE;

  # Missing input
  die 'Missing or empty input' unless $bytes;

  # Wide characters
  die 'Wide character in input'
    unless utf8::downgrade($bytes, 1);

  # Object or array
  my $res = eval {
    local $_ = $bytes;

    # Leading whitespace
    _skip_space();

    # value
    my $ref;
    _decode_value(\$ref);

    # Leftover data
    _skip_space();
    unless (pos() == length($_)) {
      my $got = ref $ref ? lc(ref($ref)) : 'scalar';
      _exception("Unexpected data after $got");
    }

    $ref;
  };

  # Exception
  if (!$res && (my $e = $@)) {
    chomp $e;
    die $e;
  }

  return $res;
}

sub false {$FALSE}

sub true {$TRUE}

sub _decode_array {
  my @array;
  _skip_space();
  until (m/\G\]/gc) {

    # Value
    _decode_value(\($array[0+@array]));

    # Separator
    my $found_separator = 0;
    _skip_space();
    if (m/\G,/gc) {
        $found_separator++;
    }

    # End
    _skip_space();
    last if m/\G\]/gc;

    _skip_space();
    redo if $found_separator;

    # Invalid character
    _exception('Expected comma or right square bracket while parsing array');
  }

  return \@array;
}

sub _decode_object {
  my %hash;
  _skip_space();
  until (m/\G\}/gc) {
    # Key
    my $key = do {
      _skip_space();
      if (m/\G([A-Za-z][a-zA-Z0-9_]*)/gc) {
        $1;
      } else {
        # Quote
        m/\G"/gc
            or _exception('Expected string while parsing object');

        _decode_string();
      }
    };

    # Colon
    _skip_space();
    m/\G=>/gc
      or _exception('Expected "=>" while parsing object');

    # Value
    _decode_value(\$hash{$key});

    # Separator
    _skip_space();
    my $found_separator = 0;
    $found_separator++ if m/\G,/gc;

    # End
    _skip_space();
    last if m/\G\}/gc;

    _skip_space();
    redo if $found_separator;

    # Invalid character
    _exception('Expected comma or right curly bracket while parsing object');
  }

  return \%hash;
}

sub _decode_string {
  my $quote = shift;
  my $pos = pos;
  # Extract string with escaped characters
  m!\G((?:(?:[^\x00-\x1f\\${quote}]|\\(?:[${quote}\\/bfnrt]|u[0-9a-fA-F]{4})){0,32766})*)!gc; # segfault on 5.8.x in t/20-mojo-json.t #83
  my $str = $1;

  # Invalid character
  unless (m/\G${quote}/gc) {
    _exception('Unexpected character or invalid escape while parsing string')
      if m/\G[\x00-\x1f\\]/;
    _exception('Unterminated string');
  }

  # Unescape popular characters
  if (index($str, '\\u') < 0) {
    $str =~ s!\\(["\\/bfnrt])!$ESCAPE{$1}!gs;
    return $str;
  }

  # Unescape everything else
  my $buffer = '';
  while ($str =~ m/\G([^\\]*)\\(?:([^u])|u(.{4}))/gc) {
    $buffer .= $1;

    # Popular character
    if ($2) { $buffer .= $ESCAPE{$2} }

    # Escaped
    else {
      my $ord = hex $3;

      # Surrogate pair
      if (($ord & 0xf800) == 0xd800) {

        # High surrogate
        ($ord & 0xfc00) == 0xd800
          or pos($_) = $pos + pos($str), _exception('Missing high-surrogate');

        # Low surrogate
        $str =~ m/\G\\u([Dd][C-Fc-f]..)/gc
          or pos($_) = $pos + pos($str), _exception('Missing low-surrogate');

        # Pair
        $ord = 0x10000 + ($ord - 0xd800) * 0x400 + (hex($1) - 0xdc00);
      }

      # Character
      $buffer .= pack 'U', $ord;
    }
  }

  # The rest
  return $buffer . substr $str, pos($str), length($str);
}

sub _skip_space {
  while (m/\G$IGNORABLE_RE/gc) {
    if (m/\G\x0d?\x0a/gc) {
      if (@HERE_QUEUE) {
        my ($v, $terminator) = @{pop @HERE_QUEUE};
        if (m/\G(.*?)^([ \t]*)$terminator$/gcsm) {
          my $buf = $1;
          my $dedent = $2;
          $buf =~ s/^$dedent//mg;
          $buf =~ s/\n\z//;
          $$v = $buf;
        } else {
          _exception("Unexpected EOF in heredoc: '$terminator'");
        }
      }
    }
  }
}

sub _decode_value {
  my $r = shift;

  # Leading whitespace
  _skip_space();

  # funcall
  if (m/\G([a-zA-Z][a-zA-Z0-9_]*)\(/gc) {
    my $func = $1;
    _decode_value(\my $v);
    m/\G\)/gc or _exception("Missing ) after funcall");
    return $$r = $SELF->call($func, $v);
  }

  # heredoc
  if (m/\G<<-([A-Za-z.]+)/gc) {
    push @HERE_QUEUE, [$r, $1];
    return;
  }

  # Raw string
  return $$r = $2 if m/\Gr('''|""""|'|")(.*?)\1/gc;

  # String
  return $$r = _decode_string($1) if m/\G(["'])/gc;

  # Array
  return $$r = _decode_array() if m/\G\[/gc;

  # Object
  return $$r = _decode_object() if m/\G\{/gc;

  # Number
  return $$r = 0 + $1
    if m/\G([-]?(?:0|[1-9][0-9]*)(?:\.[0-9]*)?(?:[eE][+-]?[0-9]+)?)/gc;

  # True
  return $$r = $TRUE if m/\Gtrue/gc;

  # False
  return $$r = $FALSE if m/\Gfalse/gc;

  # Null
  return $$r = undef if m/\Gnull/gc;  ## no critic (return)

  # Invalid character
  _exception('Expected string, array, object, number, boolean or null');
}

sub _exception {

  # Leading whitespace
  _skip_space();

  # Context
  my $context = 'Malformed JKML: ' . shift;
  if (m/\G\z/gc) { $context .= ' before end of data' }
  else {
    my @lines = split /\n/, substr($_, 0, pos);
    $context .= ' at line ' . @lines . ', offset ' . length(pop @lines || '');
  }

  die "$context\n";
}

# -------------------------------------------------------------------------

sub encode {
  my ($self, $ref) = @_;
  return Encode::encode 'UTF-8', _encode_value($ref);
}

sub _encode_array {
  my $array = shift;
  return '[' . join(',', map { _encode_value($_) } @$array) . ']';
}

sub _encode_object {
  my $object = shift;
  my @pairs = map { _encode_string($_) . '=>' . _encode_value($object->{$_}) }
    keys %$object;
  return '{' . join(',', @pairs) . '}';
}

sub _encode_string {
  my $str = shift;
  $str =~ s!([\x00-\x1f\x7f\x{2028}\x{2029}\\"/\b\f\n\r\t])!$REVERSE{$1}!gs;
  return "\"$str\"";
}

sub _encode_value {
  my $value = shift;

  # Reference
  if (my $ref = ref $value) {

    # Array
    return _encode_array($value) if $ref eq 'ARRAY';

    # Object
    return _encode_object($value) if $ref eq 'HASH';

    # True or false
    return Types::Serialiser::is_true($value)  ? 'true' : 'false' if Types::Serialiser::is_bool($ref);

    # References to scalars (including blessed) will be encoded as Booleans.
    return $$value ? 'true' : 'false' if $ref =~ /SCALAR/;

  }

  # Null
  return 'null' unless defined $value;

  # Number
  my $flags = B::svref_2object(\$value)->FLAGS;
  return 0 + $value if $flags & (B::SVp_IOK | B::SVp_NOK) && $value * 0 == 0;

  # String
  return _encode_string($value);
}

1;
__END__

=encoding utf-8

=head1 NAME

JKML - Just K markup language

=head1 SYNOPSIS

    use JKML;

    decode_jkml(<<'...');
    [
      {
        # heh.
        input => "hoghoge",
        expected => "hogehoge",
        description => <<-EOF,
        This markup language is human writable.
        
        JKML supports following features:

          * heredoc
          * raw string.
          * comments
        EOF
        regexp => r" ^^ \s+ ",
      }
    ]
    ...

=head1 DESCRIPTION

JKML is parser library for JKML. JKML is yet another markup language.

B<This module is alpha state. Any API will change without notice.>

=head2 What's difference between JSON?

JKML extends following features:

=over 4

=item Raw strings

=item Comments

=back

These features are very useful for writing test data.

=head1 JKML and encoding

You MUST use UTF-8 for every JKML data.

=head1 JKML Grammar

=over 4

=item Raw strings

JKML allows raw strings. Such as following:

    raw_string =
          "r'" .*? "'"
        | 'r"' .*? '"'
        | 'r"""' .*? '"""'
        | "r'''" .*? "'''"

Every raw string literals does not care about non terminater characters.

  r'hoge'
  r"hoge"
  r"""hoge"""
  r'''hoge'''

=item Comments

Perl5 style comemnt is allowed.

    # comment

=item String

String literal is compatible with JSON.

        string = sqstring | dqstring

         dqstring = '"' dqchar* '"'
         sqstring = "'" sqchar* "'"

         dqchar = unescaped | "'" | escaped
         sqchar = unescaped | '"' | escaped

         escaped = escape (
                    %x22 /          ; "    quotation mark  U+0022
                    %x5C /          ; \    reverse solidus U+005C
                    %x2F /          ; /    solidus         U+002F
                    %x62 /          ; b    backspace       U+0008
                    %x66 /          ; f    form feed       U+000C
                    %x6E /          ; n    line feed       U+000A
                    %x72 /          ; r    carriage return U+000D
                    %x74 /          ; t    tab             U+0009
                    %x75 4HEXDIG )  ; uXXXX                U+XXXX

         escape = %x5C              ; \

         quotation-mark = %x22      ; "

         unescaped = [^"'\]

=item Number

Number literal is compatible with JSON. See JSON RFC.

    3
    3.14
    3e14

=item Map

Map literal's grammar is:

    pair = string "=>" value
    map = "{" "}"
        | "{" pair ( "," pair )* ","?  "}"

You can omit quotes for keys, if you don't want to type it.

You can use trailing comma unlike JS.

Examples:

    {
        a => 3,
        "b" => 4,
    }

=item Array

    array = "[" "]"
          | "[" value ( "," value )* ","? "]"

Examples:

    [1,2,3]
    [1,2,3,]

=item heredoc

Ruby style heredoc.

    <<-TOC
    hoghoge
    TOC

=item Value

    value = map | array | string | raw_string | number | funcall

=item Boolean

    bool = "true" | "false"

=item NULL

    null = "null"

Will decode to C<undef>.

=item Function call

  funcall = ident "(" value ")"
  ident = [a-zA-Z_] [a-zA-Z0-9_]*

JKML supports some builtin functions.

=back

=head1 Builtin functions

=over 4

=item base64

Decode base64 string.

    base64(string)

=back

=head1 AUTHOR

tokuhirom E<lt>tokuhirom@gmail.comE<gt>

=head1 LICENSE

Copyright (C) tokuhirom

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head2 JSON::Tiny LICENSE

This library uses JSON::Tiny's code. JSON::Tiny's license term is following:

Copyright 2012-2013 David Oswald.

This program is free software, you can redistribute it and/or modify it under the terms of the Artistic License version 2.0.
See http://www.perlfoundation.org/artistic_license_2_0 for more information.


=cut