The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#ifndef SCMUNI_H
#define SCMUNI_H

#define VALID_UTF_MAX		(0x10FFFF)
#define Is_VALID_UTF(uv)	((uv) <= VALID_UTF_MAX)

#define UTF16_IS_SURROG(uv)	(0xD800 <= (uv) && (uv) <= 0xDFFF)
#define UTF16_HI_SURROG(uv)	(0xD800 <= (uv) && (uv) <= 0xDBFF)
#define UTF16_LO_SURROG(uv)	(0xDC00 <= (uv) && (uv) <= 0xDFFF)

#define UTF8A_SKIP(uv)	\
	( (uv) < 0x80           ? 1 : \
	  (uv) < 0x800          ? 2 : \
	  (uv) < 0x10000        ? 3 : \
	  (uv) < 0x200000       ? 4 : \
	  (uv) < 0x4000000      ? 5 : \
	  (uv) < 0x80000000     ? 6 : 7 )

#define UTF8A_TRAIL(c)	(((c) & 0xC0) == 0x80)

static UV
ord_in_utf16le(U8 *s, STRLEN curlen, STRLEN *retlen)
{
    UV uv, luv;
    U8 *p = s;

    if (curlen < 2) {
	if (retlen)
	    *retlen = 0;
	return 0;
    }

    uv = (UV)((p[1] << 8) | p[0]);
    p += 2;

    if (UTF16_HI_SURROG(uv) && (4 <= curlen)) {
	luv = (UV)((p[1] << 8) | p[0]);

	if (UTF16_LO_SURROG(luv)) {
	    uv = 0x10000 + ((uv-0xD800) * 0x400) + (luv-0xDC00);
	    p += 2;
	}
    }

    if (retlen)
	*retlen = p - s;
    return uv;
}


static UV
ord_in_utf16be(U8 *s, STRLEN curlen, STRLEN *retlen)
{
    UV uv, luv;
    U8 *p = s;

    if (curlen < 2) {
	if (retlen)
	    *retlen = 0;
	return 0;
    }

    uv = (UV)((p[0] << 8) | p[1]);
    p += 2;

    if (UTF16_HI_SURROG(uv) && (4 <= curlen)) {
	luv = (UV)((p[0] << 8) | p[1]);

	if (UTF16_LO_SURROG(luv)) {
	    uv = 0x10000 + ((uv-0xD800) * 0x400) + (luv-0xDC00);
	    p += 2;
	}
    }

    if (retlen)
	*retlen = p - s;
    return uv;
}


static UV
ord_in_utf32le(U8 *s, STRLEN curlen, STRLEN *retlen)
{
    if (curlen < 4) {
	if (retlen)
	    *retlen = 0;
	return 0;
    }

    if (retlen)
	*retlen = 4;
    return (UV)((s[3] << 24) | (s[2] << 16) | (s[1] << 8) | s[0]);
}


static UV
ord_in_utf32be(U8 *s, STRLEN curlen, STRLEN *retlen)
{
    if (curlen < 4) {
	if (retlen)
	    *retlen = 0;
	return 0;
    }

    if (retlen)
	*retlen = 4;
    return (UV)((s[0] << 24) | (s[1] << 16) | (s[2] << 8) | s[3]);
}


static UV
ord_in_utf8(U8 *s, STRLEN curlen, STRLEN *retlen)
{
    UV uv = 0;
    STRLEN len, i;

    if (curlen == 0) {
	if (retlen)
	    *retlen = 0;
	return 0;
    }

    len = *s < 0x80 ? 1 :
	  *s < 0xC0 ? 0 :
	  *s < 0xE0 ? 2 :
	  *s < 0xF0 ? 3 :
	  *s < 0xF8 ? 4 : 0;

    if (curlen < len || len == 0) {
	if (retlen)
	    *retlen = 0;
	return 0;
    }

    if (*s < 0x80) {
	uv = (UV)*s;
    }
    else if (*s < 0xE0) {
	uv = (UV)(((s[0] & 0x1f) << 6) | (s[1] & 0x3f));
    }
    else if (*s < 0xF0) {
	uv = (UV)(((s[0] & 0x0f) << 12) |
		  ((s[1] & 0x3f) <<  6) | (s[2] & 0x3f));
    }
    else if (*s < 0xF8) {
	uv = (UV)(((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) |
		  ((s[2] & 0x3f) <<  6) |  (s[3] & 0x3f));
    }

    for (i = 1; i < len; i++) {
	if (!UTF8A_TRAIL(s[i])) {
	    len = 0;
	    break;
	}
    }

    if (len != (STRLEN) UTF8A_SKIP(uv))
	len = 0;

    if (retlen)
	*retlen = len;
    return uv;
}


static U8*
app_in_utf16le(U8* s, UV uv)
{
    if (uv <= 0xFFFF) {
	*s++ = (U8)(uv & 0xff);
	*s++ = (U8)(uv >> 8);
    }
    else if (Is_VALID_UTF(uv)) {
	int hi, lo;
	uv -= 0x10000;
	hi = (0xD800 | (uv >> 10));
	lo = (0xDC00 | (uv & 0x3FF));
	*s++ = (U8)(hi & 0xff);
	*s++ = (U8)(hi >> 8);
	*s++ = (U8)(lo & 0xff);
	*s++ = (U8)(lo >> 8);
    }
    return s;
}


static U8*
app_in_utf16be(U8* s, UV uv)
{
    if (uv <= 0xFFFF) {
	*s++ = (U8)(uv >> 8);
	*s++ = (U8)(uv & 0xff);
    }
    else if (Is_VALID_UTF(uv)) {
	int hi, lo;
	uv -= 0x10000;
	hi = (0xD800 | (uv >> 10));
	lo = (0xDC00 | (uv & 0x3FF));
	*s++ = (U8)(hi >> 8);
	*s++ = (U8)(hi & 0xff);
	*s++ = (U8)(lo >> 8);
	*s++ = (U8)(lo & 0xff);
    }
    return s;
}


static U8*
app_in_utf32le(U8* s, UV uv)
{
    if (Is_VALID_UTF(uv)) {
	*s++ = (U8)((uv      ) & 0xff);
	*s++ = (U8)((uv >>  8) & 0xff);
	*s++ = (U8)((uv >> 16) & 0xff);
	*s++ = (U8)((uv >> 24) & 0xff);
    }
    return s;
}


static U8*
app_in_utf32be(U8* s, UV uv)
{
    if (Is_VALID_UTF(uv)) {
	*s++ = (U8)((uv >> 24) & 0xff);
	*s++ = (U8)((uv >> 16) & 0xff);
	*s++ = (U8)((uv >>  8) & 0xff);
	*s++ = (U8)((uv      ) & 0xff);
    }
    return s;
}


static U8*
app_in_utf8(U8* s, UV uv)
{
    if (uv < 0x80) {
	*s++ = (U8)(uv & 0xff);
    }
    else if (uv < 0x800) {
	*s++ = (U8)(( uv >>  6)         | 0xc0);
	*s++ = (U8)(( uv        & 0x3f) | 0x80);
    }
    else if (uv < 0x10000) {
	*s++ = (U8)(( uv >> 12)         | 0xe0);
	*s++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
	*s++ = (U8)(( uv        & 0x3f) | 0x80);
    }
    else if (Is_VALID_UTF(uv)) {
	*s++ = (U8)(( uv >> 18)         | 0xf0);
	*s++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
	*s++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
	*s++ = (U8)(( uv        & 0x3f) | 0x80);
    }
    return s;
}

#endif