The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
use Test::More tests => 162;
use utf8;
use Cpanel::JSON::XS;
use warnings;

is(Cpanel::JSON::XS->new->allow_nonref->utf8->encode("ü"), "\"\xc3\xbc\"");
is(Cpanel::JSON::XS->new->allow_nonref->encode("ü"), "\"ü\"");

is(Cpanel::JSON::XS->new->allow_nonref->ascii->utf8->encode(chr 0x8000), '"\u8000"');
is(Cpanel::JSON::XS->new->allow_nonref->ascii->utf8->pretty->encode(chr 0x10402), "\"\\ud801\\udc02\"\n");

ok not defined eval { Cpanel::JSON::XS->new->allow_nonref->utf8->decode('"ü"') };
like $@, qr/malformed UTF-8/;

is(Cpanel::JSON::XS->new->allow_nonref->decode('"ü"'), "ü");
is(Cpanel::JSON::XS->new->allow_nonref->decode('"\u00fc"'), "ü");

ok not defined eval { decode_json ('"\ud801\udc02' . "\x{10204}\"", 1) };
like $@, qr/Wide character/;

SKIP: {
  skip "5.6", 1 if $] < 5.008;
  is(Cpanel::JSON::XS->new->allow_nonref->decode('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}");
}

is(Cpanel::JSON::XS->new->allow_nonref->decode('"\"\n\\\\\r\t\f\b"'), "\"\012\\\015\011\014\010");

my $utf8_love = "I \342\235\244 perl";
is(Cpanel::JSON::XS->new->ascii->encode([$utf8_love]), '["I \u00e2\u009d\u00a4 perl"]', 'utf8 enc ascii');
is(Cpanel::JSON::XS->new->latin1->encode([$utf8_love]), "[\"I \342\235\244 perl\"]", 'utf8 enc latin1');
is(Cpanel::JSON::XS->new->utf8->encode([$utf8_love]), "[\"I \303\242\302\235\302\244 perl\"]", 'utf8 enc utf8');
is(Cpanel::JSON::XS->new->binary->encode([$utf8_love]), '["I \xe2\x9d\xa4 perl"]', 'utf8 enc binary');

SKIP: {
  skip "5.6", 4 if $] < 5.008;
  my $unicode_love = "I ❤ perl";
  is(Cpanel::JSON::XS->new->ascii->encode([$unicode_love]), '["I \u2764 perl"]', 'unicode enc ascii');
  is(Cpanel::JSON::XS->new->latin1->encode([$unicode_love]), "[\"I \\u2764 perl\"]", 'unicode enc latin1');
  is(Cpanel::JSON::XS->new->utf8->encode([$unicode_love]), "[\"I \342\235\244 perl\"]", 'unicode enc utf8');
  is(Cpanel::JSON::XS->new->binary->encode([$unicode_love]), '["I \xe2\x9d\xa4 perl"]', 'unicode enc binary');
}

# TODO: test utf8 hash keys,
# test utf8 strings without any char > 0x80.

# warn on the 66 non-characters as in core
{
  BEGIN { 'warnings'->import($] < 5.014 ? 'utf8' : 'nonchar') }
  my $w = '';
  $SIG{__WARN__} = sub { $w = shift };
  my $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ufdd0"');
  my $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{fdd0}", substr($warn,0,31)."...");
  }
  like ($warn, qr/^Unicode non-character U\+FDD0 is/);
  $w = '';
  # higher planes
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud83f\udfff"');
  $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{1ffff}", substr($warn,0,31)."...");
  }
  like ($w, qr/^Unicode non-character U\+1FFFF is/);
  $w = '';
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud87f\udffe"');
  $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{2fffe}", substr($warn,0,31)."...");
  }
  like ($w, qr/^Unicode non-character U\+2FFFE is/);

  $w = '';
  $d = Cpanel::JSON::XS->new->allow_nonref->decode('"\ud8a4\uddd1"');
  $warn = $w;
  is ($d, "\x{391d1}", substr($warn,0,31)."...");
  is ($w, '');
}
{
  my $w;
  BEGIN { 'warnings'->import($] < 5.014 ? 'utf8' : 'nonchar') }
  $SIG{__WARN__} = sub { $w = shift };
  # no warning with relaxed
  my $d = Cpanel::JSON::XS->new->allow_nonref->relaxed->decode('"\ufdd0"');
  my $warn = $w;
  {
    no warnings 'utf8';
    is ($d, "\x{fdd0}", "no warning with relaxed");
  }
  is($w, undef);
}

# security exploits via ill-formed subsequences
# see http://unicode.org/reports/tr36/#UTF-8_Exploit
# testcases from Encode/t/utf8strict.t
# All these sequences are not handled by the unsafe, fast XS decoder,
# rather passed through to the safe Perl decoder, which detects those.
my @ill =
  (# http://smontagu.damowmow.com/utf8test.html
   # The numbers below, like 2.1.2 are test numbers on this web page
   qq/80/          ,             # 3.1.1
   qq/bf/          ,             # 3.1.2
   qq/80 bf/       ,             # 3.1.3
   qq/80 bf 80/    ,             # 3.1.4
   qq/80 bf 80 bf/ ,             # 3.1.5
   qq/80 bf 80 bf 80/ ,          # 3.1.6
   qq/80 bf 80 bf 80 bf/ ,       # 3.1.7
   qq/80 bf 80 bf 80 bf 80/ ,    # 3.1.8
   qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ , # 3.1.9
   qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ , # 3.2.1
   qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ , # 3.2.2
   qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ , # 3.2.3
   qq/f8 20 f9 20 fa 20 fb 20/ , # 3.2.4
   qq/fc 20 fd 20/ ,             # 3.2.5
   qq/c0/ ,                      # 3.3.1
   qq/e0 80/ ,                   # 3.3.2
   qq/f0 80 80/ ,                # 3.3.3
   qq/f8 80 80 80/ ,             # 3.3.4
   qq/fc 80 80 80 80/ ,          # 3.3.5
   qq/df/ ,                      # 3.3.6
   qq/ef bf/ ,                   # 3.3.7
   qq/f7 bf bf/ ,                # 3.3.8
   qq/fb bf bf bf/ ,             # 3.3.9
   qq/fd bf bf bf bf/ ,          # 3.3.10
   qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ , # 3.4.1
   qq/fe/ ,                      # 3.5.1
   qq/ff/ ,                      # 3.5.2
   qq/fe fe ff ff/ ,             # 3.5.3
   qq/f0 8f bf bf/ ,             # 4.2.3
   qq/f8 87 bf bf bf/ ,          # 4.2.4
   qq/fc 83 bf bf bf bf/ ,       # 4.2.5
   qq/c0 af/ ,                   # 4.1.1  # ! overflow not with perl 5.6
   qq/e0 80 af/ ,                # 4.1.2  # ! overflow not with perl 5.6
   qq/f0 80 80 af/ ,             # 4.1.3  # ! overflow not with perl 5.6
   qq/f8 80 80 80 af/ ,          # 4.1.4  # ! overflow not with perl 5.6
   qq/fc 80 80 80 80 af/ ,       # 4.1.5  # ! overflow not with perl 5.6
   qq/c1 bf/ ,                   # 4.2.1  # ! overflow not with perl 5.6
   qq/e0 9f bf/ ,                # 4.2.2  # ! overflow not with perl 5.6
   qq/c0 80/ ,                   # 4.3.1  # xx! overflow not with perl 5.6
   qq/e0 80 80/ ,                # 4.3.2  # xx! overflow not with perl 5.6
   qq/f0 80 80 80/ ,             # 4.3.3  # xx! overflow not with perl 5.6
   qq/f8 80 80 80 80/ ,          # 4.3.4  # xx! overflow not with perl 5.6
   qq/fc 80 80 80 80 80/ ,       # 4.3.5  # xx! overflow not with perl 5.6
   # non-shortest form of 5c i.e. "\\"
   qq/c1 9c/ ,                            # ! not with perl 5.6
  );

{
  # these are no multibyte codepoints, just raw utf8 bytes,
  # so most of them work with 5.6 also.
  BEGIN { $^W = 1 }
  BEGIN { 'warnings'->import($] < 5.014 ? 'utf8' : 'nonchar') }
  my $w;
  $SIG{__WARN__} = sub { $w = shift };

  for my $ill (@ill) {
    my $o = pack "C*" => map {hex} split /\s+/, $ill;
    my $d = eval { decode_json("[\"$o\"]"); };
    is ($d, undef, substr($@,0,25))
      or diag $w, ' ', $ill, "\t => ", $d->[0], " $@";
    like($@, qr/malformed UTF-8 character/, "ill-formed utf8 <$ill> throws error");
    is($d, undef, "without warning");
    $w = undef;
  }
}