The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* -*- Mode: c; c-basic-offset: 2 -*-
 *
 * raptor_utf8.c - Raptor Unicode Normal Form C (NFC) support
 *
 * Copyright (C) 2004-2006, David Beckett http://purl.org/net/dajobe/
 * Copyright (C) 2004-2004, University of Bristol, UK http://www.bristol.ac.uk/
 * 
 * This package is Free Software and part of Redland http://librdf.org/
 * 
 * It is licensed under the following three licenses as alternatives:
 *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
 *   2. GNU General Public License (GPL) V2 or any newer version
 *   3. Apache License, V2.0 or any newer version
 * 
 * You may not use this file except in compliance with at least one of
 * the above three licenses.
 * 
 * See LICENSE.html or LICENSE.txt at the top of this package for the
 * complete terms and further detail along with the license texts for
 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
 * 
 *
 * See Unicode Normalization http://unicode.org/unicode/reports/tr15/
 * for the definition of Unicode Normal Form C (NFC)
 * 
 */


#ifdef HAVE_CONFIG_H
#include <raptor_config.h>
#endif

#ifdef WIN32
#include <win32_raptor_config.h>
#endif

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#include <stdio.h>
#include <stdarg.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

/* Raptor includes */
#include "raptor.h"
#include "raptor_internal.h"
#include "raptor_nfc.h"


#undef RAPTOR_DEBUG_NFC_CHECK


/*
 * raptor_nfc_check_combiners - Check for allowed combining characters
 * @base: first character (U+0...U+FFFF)
 * @follow: second character (U+0...U+FFFF)
 *
 * Return value: non-0 if combination is allowed
 */
static int
raptor_nfc_check_combiners(u16 base, u16 follow)
{
  int low=0;
  int high=RAPTOR_NFC_RECOMBINERS_COUNT;

  while(low < high) {
    int middle=(low+high)/2;
    u16 middle_base=raptor_nfc_recombiners[middle].base;
    
    if(base == middle_base) {
      /* found base */
      u16 middle_follow=raptor_nfc_recombiners[middle].follow;
      if(middle_follow == follow) 
        /* success */
        return 1;

      /* otherwise need to binary search for further 'follow'
       * values for this base 
       */
      if(follow < middle_follow)
        high=middle;
      else
        low=middle+1;
    } else if(base < middle_base)
      high=middle;
    else
      low=middle+1;
  }

  return raptor_nfc_recombiners[low].base == base &&
         raptor_nfc_recombiners[low].follow == follow;
}



static int
raptor_nfc_get_class(unsigned long key)
{
  int low=0;
  int high=RAPTOR_NFC_CLASSES_COUNT;

  while (low < high) {
    int middle=(low+high)/2;
    unsigned int middle_key=raptor_nfc_classes[middle].key;

    /* found class */
    if(key == middle_key)
      return raptor_nfc_classes[middle].combining_class;

    /* otherwise need to binary search further */
    if(key < middle_key)
      high=middle;
    else
      low=middle+1;
  }

  return raptor_nfc_classes[low].combining_class;
}


static RAPTOR_INLINE raptor_nfc_code_flag
raptor_nfc_get_code_flag (unsigned long c)
{
  if(c < 0x10900)  {
    /* U+0 to U+108FF - from flags table (first 0x10900 entries) */
    if(c & 1)
      return (raptor_nfc_code_flag)(raptor_nfc_flags[c>>1] & 0xF);
    else
      return (raptor_nfc_code_flag)(raptor_nfc_flags[c>>1] >>4);
  } else if(c < 0x1D000)
    /* U+10900 to U+1CFFF - codes do not exist */
    return NoNo;
  else if(c < 0x1D800) {
    /* U+1D000 to U+1D7FF - from flags table (after first 0x10900) */
    c -= (0x1D000-0x10900);
    if(c & 1)
      return (raptor_nfc_code_flag)(raptor_nfc_flags[c>>1] & 0xF);
    else
      return (raptor_nfc_code_flag)(raptor_nfc_flags[c>>1] >>4);
  } else if(c < 0x20000)
    /* U+1D800 to U+1FFFF - codes do not exist */
    return NoNo;
  else if(c < 0x2A6D7)
    /* U+20000 to U+2A6D6 - CJK Ideograph Extension B  - simple */
    return simp;
  else if(c < 0x2F800)
    /* U+2A6D8 to U+2F7FF - codes do not exist */
    return NoNo;
  else if(c < 0x2FA1E)
    /* U+2F800 to U+2FA1D - CJK Compatibility Ideographs Supplement - forbidden/excluded in NFC */
    /* FIXME Unicode 4 says to 2FA1F */
    return NOFC;
  else if(c == 0xE0001)
    /* U+E0001 - "Language Tag" - simple */
    return simp;
  else if(c < 0xE0020)
    /* U+E0002 to U+E001F - codes do not exist */
    return NoNo;
  else if(c < 0xE0080)
    /* U+E0020 to U+E007F - Tag components - simple */
    return simp;
  else if(c < 0xE0100)
    /* U+E0080 to U+E00FF - codes do not exist */
    return NoNo;
  else if(c < 0xE01F0)
    /* U+E0100 to U+E01EF - Variation Selectors Supplement - simple */
    return simp;
  else
    /* otherwise does not exist/forbidden */
    return NoNo;
}



#ifdef RAPTOR_DEBUG_NFC_CHECK
#define RAPTOR_NFC_CHECK_FAIL(char, reason) do { fprintf(stderr, "%s:%d:%s: NFC check failed on U+%04lX: " reason "\n", __FILE__, __LINE__, __func__, char); } while(0)
#else
#define RAPTOR_NFC_CHECK_FAIL(char, reason)
#endif


/**
 * raptor_nfc_check:
 * @input: UTF-8 string
 * @length: length of string
 * @errorp: pointer to store offset of character in error (or NULL)
 *
 * Unicode Normal Form C (NFC) check function.
 *
 * If errorp is not NULL, it is set to the offset of the character
 * in error in the buffer, or <0 if there is no error.
 * 
 * Return value: Non 0 if the string is NFC
 **/
int
raptor_nfc_check (const unsigned char* string, size_t len, int *error)
{
  const unsigned char* start;
  int is_start;
  size_t offset;
  
  raptor_nfc_code_flag prev_char_flag=(raptor_nfc_code_flag)0;
  unsigned long prev_char=0L;
  int prev_class;
  
  start=string;
  is_start=1;
  offset=0;
  prev_class=0;

  while(offset < len) {
    unsigned long unichar;
    int unichar_len;
    int combining_class=0;
    raptor_nfc_code_flag flag;
    
    unichar_len=raptor_utf8_to_unicode_char(&unichar, string, len);
    if(unichar_len < 0 || unichar_len > (int)len) {
      /* UTF-8 encoding had an error or ended in the middle of a string */
      if(error)
        *error=offset;
      RAPTOR_NFC_CHECK_FAIL(unichar, "UTF-8 decoding error");
      return 0;
    }
    string += unichar_len; 
    offset += unichar_len;
    
    len -= unichar_len;

    flag = raptor_nfc_get_code_flag(unichar);

    switch (flag) {
      case HIGH:
      case loww:
      case NOFC:
        /* Forbidden combinations:
         * HIGH: high surrogate
         *
         * loww: low surrogate
         *
         * NOFC: Either singleton or excluded.  Exclusions are given in:
         *   http://www.unicode.org/unicode/reports/tr15/#Primary_Exclusion_List_Table
         *   http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
         */
        if(error)
          *error=offset;
        RAPTOR_NFC_CHECK_FAIL(unichar, "forbidden combinations - HIGH, loww, NOFC");
        return 0;
        

      case NoNo:
        /* character does not exist */
        if(error)
          *error=offset;
        RAPTOR_NFC_CHECK_FAIL(unichar, "NoNo - character does not exist");
        return 0;


      case ReCo:
        /* class > 0 and recombining */

        /* class > 0 are forbidden at start */
        if(is_start) {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "ReCo at start");
          return 0;
        }
        combining_class=raptor_nfc_get_class(unichar);

        /* check 1 - previous class later than current, always an error */
        if(prev_class > combining_class) {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "ReCo and prev class > current");
          return 0;
        }
        
        /* check 2 - previous class same as current - always OK */

        /* check 3 - previous class earlier than current class.
         * Only perform combining check when both are in range
         */
        if(prev_class < combining_class && 
           prev_char < 0x10000 && unichar < 0x10000 &&
           raptor_nfc_check_combiners(prev_char, unichar)) {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "ReCo and combiners check failed");
          return 0;
        }

        break;


      case NoRe:
        /* class >0, not recombining */

        /* class > 0 are forbidden at start */
        if(is_start)  {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "NoRe at start");
          return 0;
        }

        combining_class=raptor_nfc_get_class(unichar);
        if(prev_class > combining_class) {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "NoRe and prev class > current");
          return 0;
        }

        break;


      case COM0:
        /* class is 0 and composing */

        /* Composing characters forbidden at start */
        if(is_start)  {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "COMB at start");
          return 0;
        }
        
        /* Only perform combining check when both are in range */
        if(prev_char < 0x10000 && unichar < 0x10000 && 
           raptor_nfc_check_combiners(prev_char, unichar)) {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "COMB and combiners check failed");
          return 0;
        }

        combining_class=0;
        break;


      case Hang:
        /* hangul Jamo (Korean) initial consonants */

        combining_class=0;
        break;


      case hAng:
        /* Hangul Jamo (Korean) medial vowels 
         * Must not be at the start and must not follow
         * a Hangul initial consonant
         */
        if(is_start || prev_char_flag == Hang) {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "hAng at start");
          return 0;
        }

        combining_class=0;
        break;


      case haNG:  
        /* hangul trailing consonants 
         * Must not be at the start and must not follow a
         * hangul initial/medial syllable
         */
        if(is_start || prev_char_flag == HAng) {
          if(error)
            *error=offset;
          RAPTOR_NFC_CHECK_FAIL(unichar, "haNG at start");
          return 0;
        }

        combining_class=0;
        break;


      case HAng:
        /* Hangul Jamo (Korean) initial/medial syllables */

        combining_class=0;
        break;


      case Base:
        /* base that combines */
      case simp:
        /* simple characters */

        combining_class=0;
        break;
    }

    prev_char=unichar;
    prev_char_flag=flag;
    prev_class=combining_class;

    is_start=0;
  }

  return 1;
}