unicode.c - metacpan.org

/* This file is a Unicode library in the programming language C which
   deals with conversions to and from the UTF-8 format. */

/*
  Author: 

  Ben Bullock <benkasminbullock@gmail.com>, <bkb@cpan.org>

  Repository: 
  
  https://github.com/benkasminbullock/unicode-c
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "unicode.h"

#ifdef HEADER

/*  _     _           _ _       
   | |   (_)_ __ ___ (_) |_ ___ 
   | |   | | '_ ` _ \| | __/ __|
   | |___| | | | | | | | |_\__ \
   |_____|_|_| |_| |_|_|\__|___/ */
                             


/* The maximum number of bytes we need to contain any Unicode code
   point as UTF-8 as a C string. This length includes one trailing nul
   byte. */

#define UTF8_MAX_LENGTH 5

/* The maximum possible value of a Unicode code point. See
   http://www.cl.cam.ac.uk/~mgk25/unicode.html#ucs. */

#define UNICODE_MAXIMUM 0x10ffff

/* The maximum possible value which will fit into four bytes of
   UTF-8. This is larger than UNICODE_MAXIMUM. */

#define UNICODE_UTF8_4 0x1fffff

/*  ____      _                                 _                 
   |  _ \ ___| |_ _   _ _ __ _ __   __   ____ _| |_   _  ___  ___ 
   | |_) / _ \ __| | | | '__| '_ \  \ \ / / _` | | | | |/ _ \/ __|
   |  _ <  __/ |_| |_| | |  | | | |  \ V / (_| | | |_| |  __/\__ \
   |_| \_\___|\__|\__,_|_|  |_| |_|   \_/ \__,_|_|\__,_|\___||___/ */
                                                               

/* All of the functions in this library return an int. Negative values
   are used to indicate errors. */

/* For routines which don't use the return value to communicate data
   back to the caller, the following return value indicates a
   successful completion. */

#define UNICODE_OK 0

/* Input is not valid UTF-8, specifically the first byte wasn't
   valid. */

#define UNICODE_BAD_INPUT -1

/* An illegal surrogate pair code was attempted to be turned into
   UTF-8. */

#define UNICODE_SURROGATE_PAIR -2

/* Values not forming a surrogate pair were tried to be converted as
   if they were a surrogate pair. */

#define UNICODE_NOT_SURROGATE_PAIR -3

/* Input which was supposed to be UTF-8 encoded was not, specifically
   the first byte was valid, but the second or third or fourth bytes
   were not valid. If the first byte is not valid, UNICODE_BAD_INPUT
   is returned instead of this. */

#define UNICODE_BAD_UTF8 -4

/* A string which was supposed to contain UTF-8 bytes turned out to
   contain a zero byte as its first byte. */

#define UNICODE_EMPTY_INPUT -5

/* Some UTF-8 bytes were not in the shortest possible form. See
   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8. */

#define UNICODE_NON_SHORTEST -6

/* There was an attempt to convert a code point which was greater than
   UNICODE_UTF8_4 into UTF-8 bytes. */

#define UNICODE_TOO_BIG -7

/* The Unicode code-point ended with either 0xFFFF or 0xFFFE, meaning
   it cannot be used as a character code point. */

#define UNICODE_NOT_CHARACTER -8

#endif /* def HEADER */

/* https://metacpan.org/source/CHANSEN/Unicode-UTF8-0.60/UTF8.xs#L8 */

static const uint8_t utf8_sequence_len[0x100] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
};

/* The return value is how many bytes of UTF-8 character point "c"
   will become. If "c" is not a valid UTF-8 byte, the value
   UNICODE_BAD_INPUT is returned. */

int utf8_bytes (unsigned char c)
{
    int r;
    r = utf8_sequence_len[c];
    if (r == 0) {
	return UNICODE_BAD_INPUT;
    }
    return r;
}

#define FOUR(x)					\
    (((x[0] & 0x07) << 18)			\
     | ((x[1] & 0x3F) << 12)			\
     | ((x[2] & 0x3F) <<  6)			\
     | ((x[3] & 0x3F)))

/* Try to convert "input" from UTF-8 to UCS-2, and return a value even
   if the input is partly broken.  This checks the first byte of the
   input, but it doesn't check the subsequent bytes. */

int
utf8_no_checks (const unsigned char * input, const unsigned char ** end_ptr)
{
    unsigned char c;
    c = input[0];
    switch (utf8_sequence_len[c]) {
    case 1:
        * end_ptr = input + 1;
        return c;

    case 2:
        * end_ptr = input + 2;
        return
            (c & 0x1F)<<6  |
            (input[1] & 0x3F);

    case 3:
        * end_ptr = input + 3;
        return
            (c & 0x0F)<<12 |
            (input[1] & 0x3F)<<6  |
            (input[2] & 0x3F);

    case 4:
        * end_ptr = input + 4;
	return FOUR(input);

    case 0:
	/* fall through */
    default:
	return UNICODE_BAD_INPUT;
    }
}

/* Convert a UTF-8 encoded character in "input" into a number. This
   function returns the unicode value of the UTF-8 character if
   successful, and a negative number if not successful. "end_ptr" is
   set to the next character after the read character on
   success. "end_ptr" is set to the start of input on
   failure. "end_ptr" may not be null. */

int utf8_to_ucs2 (const unsigned char * input, const unsigned char ** end_ptr)
{
    unsigned char c;
    uint8_t l;
    *end_ptr = input;
    c = input[0];
    if (c == 0) {
        return UNICODE_EMPTY_INPUT;
    }
    l = utf8_sequence_len[c];
    if (l == 1) {
        * end_ptr = input + 1;
        return c;
    }
    if (l == 2) {
	/* Two byte case. */
        if (input[1] < 0x80 || input[1] > 0xBF) {
            return UNICODE_BAD_UTF8;
	}
	if (c <= 0xC1) {
	    return UNICODE_NON_SHORTEST;
	}
        * end_ptr = input + 2;
        return
            (c & 0x1F)<<6  |
            (input[1] & 0x3F);
    }
    if (l == 3) {
	/* Three byte case. */
        if (input[1] < 0x80 || input[1] > 0xBF ||
	    input[2] < 0x80 || input[2] > 0xBF) {
            return UNICODE_BAD_UTF8;
	}
	if (c == 0xe0 && input[1] < 0xa0) { 
	    /* We don't need to check the value of input[2], because
	       the if statement above this one already guarantees that
	       it is 10xxxxxx. */
	    return UNICODE_NON_SHORTEST;
	}
        * end_ptr = input + 3;
        return
            (c & 0x0F)<<12 |
            (input[1] & 0x3F)<<6  |
            (input[2] & 0x3F);
    }
    if (l == 4) {
	/* Four byte case. */
	unsigned char d, e, f;
	uint32_t v;
	d = input[1];
	e = input[2];
	f = input[3];

	if (/* c must be 11110xxx. */
	    c >= 0xf8 ||
	    /* d, e, f must be 10xxxxxx. */
	    d < 0x80 || d >= 0xC0 ||
	    e < 0x80 || e >= 0xC0 ||
	    f < 0x80 || f >= 0xC0) {
	    return UNICODE_BAD_UTF8;
	}

	if (c == 0xf0 && d < 0x90) { 
	    /* We don't need to check the values of e and d, because
	       the if statement above this one already guarantees that
	       e and d are 10xxxxxx. */
	    return UNICODE_NON_SHORTEST;
	}
	/* Calculate the code point. */
	v = FOUR(input);
	/* Greater than U+10FFFF */
	if (v > UNICODE_MAXIMUM) {
	    return UNICODE_TOO_BIG;
	}
	/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
	if ((v & 0xffff) >= 0xfffe) {
	    return UNICODE_NOT_CHARACTER;
	}
        * end_ptr = input + 4;
	return v;
    }
    return UNICODE_BAD_INPUT;
}

/* Input: a Unicode code point, "ucs2". 

   Output: UTF-8 characters in buffer "utf8". 

   Return value: the number of bytes written into "utf8", or a
   negative number if there was an error.

   This adds a zero byte to the end of the string. It assumes that the
   buffer "utf8" has at least UNICODE_MAX_LENGTH (5) bytes of space to
   write to, without checking. */

int ucs2_to_utf8 (int ucs2, unsigned char * utf8)
{
    if (ucs2 < 0x80) {
        utf8[0] = ucs2;
        utf8[1] = '\0';
        return 1;
    }
    if (ucs2 < 0x800) {
        utf8[0] = (ucs2 >> 6)   | 0xC0;
        utf8[1] = (ucs2 & 0x3F) | 0x80;
        utf8[2] = '\0';
        return 2;
    }
    if (ucs2 < 0xFFFF) {
        utf8[0] = ((ucs2 >> 12)       ) | 0xE0;
        utf8[1] = ((ucs2 >> 6 ) & 0x3F) | 0x80;
        utf8[2] = ((ucs2      ) & 0x3F) | 0x80;
        utf8[3] = '\0';
	if (ucs2 >= 0xD800 && ucs2 <= 0xDFFF) {
	    /* Ill-formed. */
	    return UNICODE_SURROGATE_PAIR;
	}
        return 3;
    }
    if (ucs2 <= UNICODE_UTF8_4) {
	/* http://tidy.sourceforge.net/cgi-bin/lxr/source/src/utf8.c#L380 */
	utf8[0] = 0xF0 | (ucs2 >> 18);
	utf8[1] = 0x80 | ((ucs2 >> 12) & 0x3F);
	utf8[2] = 0x80 | ((ucs2 >> 6) & 0x3F);
	utf8[3] = 0x80 | ((ucs2 & 0x3F));
        utf8[4] = '\0';
        return 4;
    }
    return UNICODE_TOO_BIG;
}

#define UNI_SUR_HIGH_START  0xD800
#define UNI_SUR_HIGH_END    0xDBFF
#define UNI_SUR_LOW_START   0xDC00
#define UNI_SUR_LOW_END     0xDFFF

/* Convert a unicode code point "unicode" into its high and low
   surrogate pair equivalents, in "*hi_ptr" and "*lo_ptr".

   Return value:

   If "unicode" does not need to be a surrogate pair, the error
   UNICODE_NOT_SURROGATE_PAIR is returned, and the values of "*hi_ptr"
   and "*lo_ptr" are undefined. If the conversion is successful,
   UNICODE_OK is returned. */

static const int halfShift  = 10; /* used for shifting by 10 bits */
static const uint32_t halfBase = 0x0010000UL;
#define LOWTENBITS 0x3FF

int
unicode_to_surrogates (unsigned unicode, int * hi_ptr, int * lo_ptr)
{
    int hi = UNI_SUR_HIGH_START;
    int lo = UNI_SUR_LOW_START;
    if (unicode < halfBase) {
	/* Doesn't need to be a surrogate pair, let's recycle this
	   constant here. */
	return UNICODE_NOT_SURROGATE_PAIR;
    }
    unicode -= halfBase;
    hi |= ((unicode >>halfShift) & LOWTENBITS);
    lo |= ((unicode) & LOWTENBITS);
    * hi_ptr = hi;
    * lo_ptr = lo;
    return UNICODE_OK;
}

/* Convert a surrogate pair in "hi" and "lo" to a single Unicode
   value. The return value is the Unicode value. If the return value
   is negative, an error has occurred. If "hi" and "lo" do not form a
   surrogate pair, the error value UNICODE_NOT_SURROGATE_PAIR is
   returned. */

/* https://android.googlesource.com/platform/external/id3lib/+/master/unicode.org/ConvertUTF.c */

int
surrogates_to_unicode (int hi, int lo)
{
    uint32_t u;
    if (hi >= UNI_SUR_HIGH_START && hi <= UNI_SUR_HIGH_END) {
	/* If it's a low surrogate, convert to UTF32. */
	if (lo >= UNI_SUR_LOW_START && lo <= UNI_SUR_LOW_END) {
	    u = ((hi - UNI_SUR_HIGH_START) << halfShift)
	      + (lo - UNI_SUR_LOW_START) + halfBase;
	    return u;
	}
    }
    return UNICODE_NOT_SURROGATE_PAIR;
}

/* Convert surrogate pairs to UTF-8. */

int surrogate_to_utf8 (int hi, int lo, unsigned char * utf8)
{
    int C;
    C = surrogates_to_unicode (hi, lo);
    if (C < 0) {
	return C;
    }
    return ucs2_to_utf8 (C, utf8);
}

/* Given a nul-terminated string "utf8" and a number of Unicode
   characters "n_chars", return the number of bytes into "utf8" at
   which the end of the characters occurs. A negative value indicates
   some kind of error. If the return value is UNICODE_EMPTY_INPUT,
   "utf8" contained a zero byte. This may also return any of the
   values of "utf8_to_ucs2". */

int
unicode_chars_to_bytes (const unsigned char * utf8, int n_chars)
{
    int i;
    const unsigned char * p = utf8;
    int len = strlen ((const char *) utf8);
    if (len == 0 && n_chars != 0) {
	return UNICODE_EMPTY_INPUT;
    }
    for (i = 0; i < n_chars; i++) {
        int ucs2 = utf8_to_ucs2 (p, & p);
        if (ucs2 < 0) {
	    return ucs2;
        }
    }
    return p - utf8;
}

/* Like unicode_count_chars, but without error checks or validation of
   the input. */

int unicode_count_chars_fast (const unsigned char * utf8)
{
    int chars;
    const unsigned char * p;
    chars = 0;
    p = utf8;
    while (*p) {
	int len;
	len = utf8_sequence_len[*p];
	if (len < 0) {
	    return len;
	}
	p += len;
	chars++;
    }
    return chars;
}

/* Given a nul-terminated string "utf8", return the total number of
   Unicode characters it contains.

   Return value

   If an error occurs, this may return UNICODE_BAD_INPUT or any of the
   errors of "utf8_to_ucs2". */

int unicode_count_chars (const unsigned char * utf8)
{
    int chars = 0;
    const unsigned char * p = utf8;
    int len = strlen ((const char *) utf8);
    if (len == 0) {
        return 0;
    }
    while (p - utf8 < len) {
        int ucs2;
        ucs2 = utf8_to_ucs2 (p, & p);
        if (ucs2 < 0) {
            return ucs2;
        }
        chars++;
        if (*p == '\0') {
            return chars;
        }
    }
    /* Cannot be reached in practice, since strlen indicates the null
       byte. */
    return UNICODE_BAD_INPUT;
}

#ifdef HEADER
#define VALID_UTF8 1
#define INVALID_UTF8 0
#endif /* def HEADER */

#ifdef HEADER

/* These are intended for use in switch statements, for example

   switch (c) {
   case BYTE_80_8F:
       do_something;

   They originally come from the Json3 project. */

#define BYTE_80_8F \
      0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86:\
 case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D:\
 case 0x8E: case 0x8F
#define BYTE_80_9F \
      0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86:\
 case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D:\
 case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94:\
 case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B:\
 case 0x9C: case 0x9D: case 0x9E: case 0x9F
#define BYTE_80_BF \
      0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86:\
 case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D:\
 case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94:\
 case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B:\
 case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:\
 case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9:\
 case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0:\
 case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7:\
 case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE:\
 case 0xBF
#define BYTE_90_BF \
      0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96:\
 case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:\
 case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4:\
 case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB:\
 case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2:\
 case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9:\
 case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF
#define BYTE_A0_BF \
      0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6:\
 case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD:\
 case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4:\
 case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:\
 case 0xBC: case 0xBD: case 0xBE: case 0xBF
#define BYTE_C2_DF \
      0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8:\
 case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:\
 case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6:\
 case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD:\
 case 0xDE: case 0xDF
#define BYTE_E1_EC \
      0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7:\
 case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC
#define BYTE_EE_EF \
      0xEE: case 0xEF
#define BYTE_F1_F3 \
      0xF1: case 0xF2: case 0xF3
#endif /* def HEADER */

#define UNICODEADDBYTE i++

#define UNICODEFAILUTF8(want) return INVALID_UTF8

#define UNICODENEXTBYTE c=input[i]

/* Given "input" and "input_length", validate "input" byte by byte up
   to "input_length". The return value may be VALID_UTF8 or
   INVALID_UTF8. */

int
valid_utf8 (const unsigned char * input, int input_length)
{
    int i;
    unsigned char c;

    i = 0;

 string_start:

    i++;
    if (i >= input_length) {
	return VALID_UTF8;
    }
    /* Set c separately here since we use a range comparison before
       the switch statement. */
    c = input[i];

    /* Admit all bytes <= 0x80. */
    if (c <= 0x80) {
	goto string_start;
    }

    switch (c) {
    case BYTE_C2_DF:
	UNICODEADDBYTE;
	goto byte_last_80_bf;
	    
    case 0xE0:
	UNICODEADDBYTE;
	goto byte23_a0_bf;
	    
    case BYTE_E1_EC:
	UNICODEADDBYTE;
	goto byte_penultimate_80_bf;
	    
    case 0xED:
	UNICODEADDBYTE;
	goto byte23_80_9f;
	    
    case BYTE_EE_EF:
	UNICODEADDBYTE;
	goto byte_penultimate_80_bf;
	    
    case 0xF0:
	UNICODEADDBYTE;
	goto byte24_90_bf;
	    
    case BYTE_F1_F3:
	UNICODEADDBYTE;
	goto byte24_80_bf;
	    
    case 0xF4:
	UNICODEADDBYTE;
	goto byte24_80_8f;

    }

 byte_last_80_bf:

    switch (UNICODENEXTBYTE) {

    case BYTE_80_BF:
	UNICODEADDBYTE;
	goto string_start;
    default:
	UNICODEFAILUTF8 (XBYTES_80_BF);
    }

 byte_penultimate_80_bf:

    switch (UNICODENEXTBYTE) {

    case BYTE_80_BF:
	UNICODEADDBYTE;
	goto byte_last_80_bf;
    default:
	UNICODEFAILUTF8 (XBYTES_80_BF);
    }

 byte24_90_bf:

    switch (UNICODENEXTBYTE) {

    case BYTE_90_BF:
	UNICODEADDBYTE;
	goto byte_penultimate_80_bf;
    default:
	UNICODEFAILUTF8 (XBYTES_90_BF);
    }

 byte23_80_9f:

    switch (UNICODENEXTBYTE) {

    case BYTE_80_9F:
	UNICODEADDBYTE;
	goto byte_last_80_bf;
    default:
	UNICODEFAILUTF8 (XBYTES_80_9F);
    }

 byte23_a0_bf:

    switch (UNICODENEXTBYTE) {

    case BYTE_A0_BF:
	UNICODEADDBYTE;
	goto byte_last_80_bf;
    default:
	UNICODEFAILUTF8 (XBYTES_A0_BF);
    }

 byte24_80_bf:

    switch (UNICODENEXTBYTE) {

    case BYTE_80_BF:
	UNICODEADDBYTE;
	goto byte_penultimate_80_bf;
    default:
	UNICODEFAILUTF8 (XBYTES_80_BF);
    }

 byte24_80_8f:

    switch (UNICODENEXTBYTE) {

    case BYTE_80_8F:
	UNICODEADDBYTE;
	goto byte_penultimate_80_bf;
    default:
	UNICODEFAILUTF8 (XBYTES_80_8F);
    }
}

/*  _____         _       
   |_   _|__  ___| |_ ___ 
     | |/ _ \/ __| __/ __|
     | |  __/\__ \ |_\__ \
     |_|\___||___/\__|___/ */
                       
/* Below this is code for testing which is not normally compiled. Use
   "make test" to compile the testing version. */

#ifdef TEST

void print_bytes (const unsigned char * bytes)
{
    int i;
    for (i = 0; i < strlen ((const char *) bytes); i++) {
        fprintf (stderr, "%02X", bytes[i]);
    }
    fprintf (stderr, "\n");
}

#define OK(test, message, ...) {			\
	(*count)++;					\
	if (test) {					\
	    printf ("ok %d - ", (*count));		\
	}						\
	else {						\
	    printf ("not ok %d - ", (*count));		\
	}						\
	printf (message, ## __VA_ARGS__);		\
	printf (".\n");					\
    }


void test_ucs2_to_utf8 (const unsigned char * input, int * count)
{
    /* Buffer to print utf8 out into. */
    unsigned char buffer[0x100];
    /* Offset into buffer. */
    unsigned char * offset;
    const unsigned char * start = input;

    offset = buffer;
    while (1) {
        int unicode;
        int bytes;
        const unsigned char * end;
        unicode = utf8_to_ucs2 (start, & end);
        if (unicode == UNICODE_EMPTY_INPUT) {
            break;
	}
	if (unicode < 0) {
	    fprintf (stderr, "%s:%d: unexpected error %d converting unicode.\n",
		     __FILE__, __LINE__, unicode);
	    // exit ok in test
	    exit (EXIT_FAILURE);
	}
        bytes = ucs2_to_utf8 (unicode, offset);
	OK (bytes > 0,  "no bad conversion");
	OK (strncmp ((const char *) offset,
		     (const char *) start, bytes) == 0, 
	    "round trip OK for %X (%d bytes)", unicode, bytes);
        start = end;
        offset += bytes;
#if 0
        printf ("%X %d\n", unicode, bytes);
#endif
    }
    * offset = '\0';
    OK (strcmp ((const char *) buffer, (const char *) input) == 0,
	
	"input %s resulted in identical output %s",
	input, buffer);
}

static void
test_invalid_utf8 (int * count)
{
    unsigned char invalid_utf8[UTF8_MAX_LENGTH];
    int unicode;
    int valid;
    const unsigned char * end;
    snprintf ((char *) invalid_utf8, UTF8_MAX_LENGTH - 1,
	      "%c%c%c", 0xe8, 0xe4, 0xe5);
    unicode = utf8_to_ucs2 (invalid_utf8, & end);
    OK (unicode == UNICODE_BAD_UTF8, 
	"invalid UTF-8 gives incorrect result");
    valid = valid_utf8 (invalid_utf8, strlen ((char *) invalid_utf8));
    OK (valid == INVALID_UTF8, "Invalid UTF-8 fails valid_utf8");
}

static void
test_surrogate_pairs (int * count)
{
    int status;
    int hi;
    int lo;
    int rt;
    /* This is the wide character space, which does not require
       representation as a surrogate pair. */
    unsigned nogood = 0x3000;
    /* 
       Two examples from the Wikipedia article on UTF-16
       https://en.wikipedia.org/w/index.php?title=UTF-16&oldid=744329865#Examples. */
    unsigned wikipedia_1 = 0x10437;
    unsigned wikipedia_2 = 0x24b62;
    /*
      An example from the JSON RFC
      http://rfc7159.net/rfc7159#rfc.section.7
    */
    unsigned json_spec = 0x1D11E;

    status = unicode_to_surrogates (nogood, & hi, & lo);

    OK (status == UNICODE_NOT_SURROGATE_PAIR, 
	"low value to surrogate pair breaker returns error");

    status = unicode_to_surrogates (wikipedia_1, & hi, & lo);
    OK (status == UNICODE_OK,  "Ok with %X", wikipedia_1);
    OK (hi == 0xD801,  "Got expected %X == 0xD801", hi);
    OK (lo == 0xDC37,  "Got expected %X == 0xDC37", lo);
    rt = surrogates_to_unicode (hi, lo);
    OK (rt == wikipedia_1,  "Round trip %X == initial %X",
	rt, wikipedia_1);

    status = unicode_to_surrogates (wikipedia_2, & hi, & lo);
    OK (status == UNICODE_OK,  "Ok with %X", wikipedia_1);
    OK (hi == 0xD852,  "Got expected %X == 0xD852", hi);
    OK (lo == 0xDF62,  "Got expected %X == 0xDF62", lo);
    rt = surrogates_to_unicode (hi, lo);
    OK (rt == wikipedia_2,  "Round trip %X == initial %X",
	rt, wikipedia_2);

    status = unicode_to_surrogates (json_spec, & hi, & lo);
    OK (status == UNICODE_OK,  "Ok with %X", json_spec);
    OK (hi == 0xD834,  "Got expected %X == 0xD834", hi);
    OK (lo == 0xDd1e,  "Got expected %X == 0xDD1e", lo);
    rt = surrogates_to_unicode (hi, lo);
    OK (rt == json_spec,  "Round trip %X == initial %X",
	rt, json_spec);
}

static void test_utf8_bytes (int * count)
{
    struct tub {
	int first;
	int expect;
    } tests[] = {
	{'a',1},
	{0xb0,UNICODE_BAD_INPUT},
	{0xc2,2},
	{0xff,UNICODE_BAD_INPUT},
    };
    int n_tests = sizeof (tests) / sizeof (struct tub);
    int i;
    for (i = 0; i < n_tests; i++) {
	/* Expected bytes. */
	int xbytes;
	int firstbyte;
	firstbyte = tests[i].first;
	xbytes = utf8_bytes (firstbyte);
	OK (xbytes == tests[i].expect, "Got %d (%d) with input %d",
	    xbytes, tests[i].expect, firstbyte);
    }
}

static const unsigned char * utf8 = (unsigned char *) "漢数字ÔÕÖＸ";

static void
test_utf8_to_ucs2 (int * count)
{
    const unsigned char * start = utf8;
    while (*start) {
        int unicode;
        const unsigned char * end;
        unicode = utf8_to_ucs2 (start, & end);
	OK (unicode > 0, "no bad value at %s", start);
        printf ("# %s is %04X, length is %d\n", start, unicode, end - start);
        start = end;
    }
}

static void
test_unicode_count_chars (int * count)
{
    int cc;
    cc = unicode_count_chars (utf8);
    OK (cc == 7, "unicode_count_chars gets seven characters for utf8");
    cc = unicode_count_chars_fast (utf8);
    OK (cc == 7, "unicode_count_chars_fast gets seven characters for utf8");
}

static void
test_valid_utf8 (int * count)
{
    int valid;
    valid = valid_utf8 (utf8, strlen ((const char *) utf8));
    OK (valid == VALID_UTF8, "Valid UTF-8 passes valid_utf8");
}

int main ()
{
    /* Test counter for TAP. */
    int count;
    count = 0;
    test_utf8_to_ucs2 (& count);
    test_ucs2_to_utf8 (utf8, & count);
    test_invalid_utf8 (& count);
    test_unicode_count_chars (& count);
    test_surrogate_pairs (& count);
    test_utf8_bytes (& count);
    test_valid_utf8 (& count);
    printf ("1..%d\n", count);
}

#endif /* def TEST */
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)