/* The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event. */

/* $Id: getcode.c 5404 2008-02-01 05:07:01Z hio $ */

#include "Japanese.h"
#include "getcode.h"

#include <string.h>

/*
 * Fallback for Perl headers that do not define dAX (the original note
 * guesses 5.6.x): declare `ax` as the stack index of the slot just after
 * MARK, counted from PL_stack_base.  xs_getcode_list uses dAX so that
 * ST(i) can address its return slots.
 */
#ifndef dAX
/* 5.6.x? */
#define dAX I32 ax = MARK - PL_stack_base + 1
#endif

#define PERL_PATCHLEVEL_H_IMPLICIT
#include "patchlevel.h"

/*
 * If PERL_VERSION is not defined but the older PATCHLEVEL macro is (the
 * original note says 5.005_xx and prior), define the PERL_* version
 * macros from the old names.
 */
#if !defined(PERL_VERSION)  && defined(PATCHLEVEL)
/* 5.005_xx and prior */
#define PERL_REVISION   5
#define PERL_VERSION    PATCHLEVEL
#define PERL_SUBVERSION SUBVERSION
#endif

/*
 * For PERL_VERSION <= 4 without a PL_stack_base macro (the original note
 * says 5.004): declare Perl_stack_base and alias PL_stack_base to it.
 * The dAX fallback earlier in this file refers to PL_stack_base.
 */
#if PERL_VERSION <= 4 && !defined(PL_stack_base)
/* 5.004 */
extern SV ** Perl_stack_base;
#define PL_stack_base Perl_stack_base
#endif

/*
 * Debug-trace switch: the fprintf traces in this file are compiled only
 * when TEST is defined and GC_DISP is non-zero.
 */
#define GC_DISP 0

/*
 * C-only boolean type.  Any existing bool/true/false macros are removed
 * first so the enum below can define them (false == 0, true == 1).
 * C++ builds skip this and keep their built-in bool.
 */
#ifndef __cplusplus
#undef bool
#undef true
#undef false
typedef enum bool { false, true, } bool;
#endif

/*
 * Character-encoding constants.
 *
 * cc_unknown, cc_utf16 and cc_utf32 have no entry in cc_tmpl: the latter
 * two are only assigned by the BOM checks in getcode_list.
 */
enum charcode_t
{
  cc_unknown,     /* no encoding identified */
  cc_ascii,
  cc_sjis,
  cc_eucjp,
  cc_jis_au,
  cc_jis_jsky,
  cc_jis,
  cc_utf8,
  cc_utf16,       /* set when a 2-byte BOM is found */
  cc_utf32,       /* set when a 4-byte BOM is found */
  cc_utf32_be,
  cc_utf32_le,
  cc_sjis_jsky,
  cc_sjis_au,
  cc_sjis_imode,
  cc_sjis_doti,
  cc_last,        /* end marker; one past the final real code */
};
typedef enum charcode_t charcode_t;

/*
 * Encoding-name strings as SV*.
 * Each macro creates a new SV with newSVpvn from the encoding's name and
 * its explicit byte length.  Note that cc_eucjp is reported as "euc".
 */
#define new_CC_UNKNOWN()  newSVpvn("unknown", 7)
#define new_CC_ASCII()    newSVpvn("ascii",   5)
#define new_CC_SJIS()     newSVpvn("sjis",    4)
#define new_CC_JIS_AU()   newSVpvn("jis-au",  6)
#define new_CC_JIS_JSKY() newSVpvn("jis-jsky",8)
#define new_CC_JIS()      newSVpvn("jis",     3)
#define new_CC_EUCJP()    newSVpvn("euc",     3)
#define new_CC_UTF8()     newSVpvn("utf8",    4)
#define new_CC_UTF16()    newSVpvn("utf16",   5)
#define new_CC_UTF32()    newSVpvn("utf32",   5)
#define new_CC_UTF32_BE() newSVpvn("utf32-be",8)
#define new_CC_UTF32_LE() newSVpvn("utf32-le",8)
#define new_CC_SJIS_JSKY()  newSVpvn("sjis-jsky",9)
#define new_CC_SJIS_IMODE() newSVpvn("sjis-imode",10)
#define new_CC_SJIS_DOTI()  newSVpvn("sjis-doti",9)
#define new_CC_SJIS_AU()    newSVpvn("sjis-au",7)

/*
 * Byte-order-mark byte sequences compared with memcmp in getcode_list:
 * the 2-byte forms select cc_utf16, the 4-byte forms select cc_utf32.
 * The 4-byte sequences contain NUL bytes, so they must always be used
 * with an explicit length, never as C strings.
 */
#define RE_BOM2_BE  "\xfe\xff"
#define RE_BOM2_LE  "\xff\xfe"
#define RE_BOM4_BE  "\x00\x00\xfe\xff"
#define RE_BOM4_LE  "\xff\xfe\x00\x00"

#if defined(TEST) && GC_DISP
/*
 * charcodeToStr: map a charcode_t constant to a printable name for the
 * debug traces (compiled only when TEST is defined and GC_DISP is set).
 *
 * Returns a pointer to a string literal, or NULL for a value that has no
 * name (e.g. cc_last).  Note that cc_eucjp is spelled "eucjp" here, while
 * new_CC_EUCJP() reports "euc".
 */
static const char* charcodeToStr(charcode_t code)
{
  switch(code)
  {
  case cc_unknown:  return "unknown";
  case cc_ascii:    return "ascii";
  case cc_sjis:     return "sjis";
  case cc_eucjp:    return "eucjp";
  case cc_jis_au:   return "jis-au";
  case cc_jis_jsky: return "jis-jsky";
  case cc_jis:      return "jis";
  case cc_utf8:     return "utf8";
  /* cc_utf16 is produced by the BOM check in getcode_list; without this
   * case the trace in xs_getcode passed NULL to fprintf("%s"). */
  case cc_utf16:    return "utf16";
  case cc_utf32:    return "utf32";
  case cc_utf32_be: return "utf32-be";
  case cc_utf32_le: return "utf32-le";
  case cc_sjis_jsky:  return "sjis-jsky";
  case cc_sjis_imode: return "sjis-imode";
  case cc_sjis_doti:  return "sjis-doti";
  case cc_sjis_au:    return "sjis-au";
  default: return NULL;
  }
}
#endif
#ifdef TEST
/*
 * TEST-only state labels, one table per candidate encoding.
 * GEN_CODE attaches mode_<name> to CodeCheck.msg, and the GC_DISP trace
 * in getcode_list prints msg[state] for each transition.
 * NOTE(review): DECL_MAP_MODE is defined outside this file view; it
 * presumably declares mode_<name> as an array of the given size - confirm.
 */
DECL_MAP_MODE(ascii,1) = { "ascii", };
DECL_MAP_MODE(eucjp,5) =
{ "eucjp", "0212:3.1","0212:3.2","c:2.1","kana:2.1",};
DECL_MAP_MODE(sjis,2) = { "sjis","c:2.1", };
DECL_MAP_MODE(jis,11) =
{
  "jis","jis#1","jis#2","jis#3","jis#4","jis#5","jis#6",
  "jis#7","jis#loop1","jis#loop2","jis#kana",
};
DECL_MAP_MODE(jis_au,12) =
{
  "jis","jis#1","jis#2","jis#3","jis#4","jis#5","jis#6",
  "jis#7","jis#loop1","jis#loop2","jis#kana","jis#au",
};
DECL_MAP_MODE(jis_jsky,13) =
{
  "jis","jis#1","jis#2","jis#3","jis#4","jis#5","jis#6",
  "jis#7","jis#loop1","jis#loop2","jis#kana","jis#j2","jis#jend",
};
DECL_MAP_MODE(utf8,6) = 
{
  "utf8",
  "u8:6.1","u8:6.2","u8:6.3","u8:6.4","u8:6.5",
};
DECL_MAP_MODE(utf32_be,4) = 
{
  "utf32-be","utf32-be:4:1","utf32-be:4:2","utf32-be:4:3",
};
DECL_MAP_MODE(utf32_le,4) = 
{
  "utf32-le","utf32-le:4:1","utf32-le:4:2","utf32-le:4:3",
};
DECL_MAP_MODE(sjis_jsky,5) =
{
  "sjis","c:2.1",
  "jsky:start:1","jsky:start:2","jsky:code1",
};
DECL_MAP_MODE(sjis_imode,4) =
{
  "sjis","c:2.1",
  "imode1:1","imode2:1",
};
DECL_MAP_MODE(sjis_doti,7) =
{
  "sjis","c:2.1",
  "doti1:1", "doti2:1", "doti3:1", "doti4:1", "doti5:1",
};
DECL_MAP_MODE(sjis_au,3) =
{
  "sjis","c:2.1",
  "au:1",
};
#endif

/* Structure used while detecting the character encoding: the state of one
 * candidate encoding. */
struct CodeCheck
{
  /* Encoding this candidate stands for. */
  charcode_t code;
  /* Start of the candidate's transition table; getcode_list locates the
   * row for state s at base + s*256. */
  const unsigned char* base;
  /* Row for the current state; set to NULL once the candidate is rejected. */
  const unsigned char* table;
#ifdef TEST
  /* State labels indexed by state number, used by the debug trace. */
  const char** msg;
#endif
};
typedef struct CodeCheck CodeCheck;

/*
 * Initial state of encoding detection.
 * GEN_CODE(name) builds one CodeCheck initializer: code cc_<name>, with
 * both base and table pointing at map_<name> (defined outside this file
 * view); TEST builds also attach the mode_<name> label table.
 */
#ifndef TEST
#define GEN_CODE(name) \
  { cc_##name, (const unsigned char*)map_##name, (const unsigned char*)map_##name, }
#else
#define GEN_CODE(name) \
  { cc_##name, (const unsigned char*)map_##name, (const unsigned char*)map_##name, mode_##name, }
#endif
/* Number of entries in cc_tmpl; callers size their CodeCheck arrays with it. */
#define cc_tmpl_max 13
/*
 * Candidate list that getcode_list copies into the caller's work array.
 * Order matters: surviving candidates keep this order, and xs_getcode
 * reports the first survivor.
 */
const CodeCheck cc_tmpl[cc_tmpl_max] = 
{
  GEN_CODE(utf32_be),
  GEN_CODE(utf32_le),
  GEN_CODE(ascii),
  GEN_CODE(jis),
  GEN_CODE(jis_au),
  GEN_CODE(jis_jsky),
  GEN_CODE(eucjp),
  GEN_CODE(sjis),
  GEN_CODE(sjis_jsky),
  GEN_CODE(sjis_imode),
  GEN_CODE(sjis_au),
  GEN_CODE(sjis_doti),
  GEN_CODE(utf8),
};

/*
 * Structure for a detection result.
 * NOTE(review): not referenced anywhere in the visible part of this file;
 * begin/len presumably describe a byte range of the input - confirm
 * against any external users before relying on it.
 */
struct CodeResult
{
  charcode_t code;
  int begin;
  int len;
};
typedef struct CodeResult CodeResult;

/*
 * _is_acceptable_state: report whether a candidate whose state machine is
 * not back in its initial state may still be accepted at end of input.
 *
 * Exactly two states qualify.  Their labels in the TEST-only mode tables
 * are "jis#j2" (jis-jsky, state 11) and "jsky:code1" (sjis-jsky, state 4).
 * Returns true for either of them and false for anything else.
 */
static bool _is_acceptable_state(const CodeCheck* check)
{
  const unsigned char* const state = check->table;

  return ( state==map_jis_jsky[11] || state==map_sjis_jsky[4] ) ? true : false;
}

/*
 * getcode_list: run every candidate encoding's state machine over the
 * bytes of sv_str and collect the encodings that accept the whole string.
 *
 *   sv_str - scalar to inspect; get-magic is invoked if present.
 *   check  - caller-provided array with room for cc_tmpl_max entries.
 *            On return its first <result> entries hold the surviving
 *            candidates, in cc_tmpl order.
 *
 * Returns the number of surviving candidates, or 0 when sv_str is undef,
 * is the empty string, or is rejected by every candidate.  When a
 * UTF-32 or UTF-16 BOM is recognized it returns 1 with only
 * check[0].code filled in (base/table are left untouched).
 */
static int getcode_list(SV* sv_str, CodeCheck* check)
{
  unsigned char* src;
  STRLEN len;
  const unsigned char* src_end;
  int cc_max;
  
  if( sv_str==&PL_sv_undef )
  {
    return 0;
  }
  if( SvGMAGICAL(sv_str) )
  {
    mg_get(sv_str);
  }
  if( !SvOK(sv_str) )
  {
    return 0;
  }
  
  src = (unsigned char*)SvPV(sv_str, len);
  src_end = src+len;
  
  /* empty string */
  /* an empty string is reported as unknown (no candidates) */
  if( len==0 )
  {
    return 0;
  }
  
  /* BOM of UTF32 */
  /* Checked before UTF-16 because the UTF-32 LE BOM begins with the
   * same two bytes as the UTF-16 LE BOM. */
  if( (len%4)==0 && len>=4 &&
      ( memcmp(src,RE_BOM4_BE,4)==0 || memcmp(src,RE_BOM4_LE,4)==0 ) )
  {
    check[0].code = cc_utf32;
    return 1;
  }
  
  /* BOM of UTF16 */
  if( (len%2)==0 && len>=2 &&
      ( memcmp(src,RE_BOM2_BE,2)==0 || memcmp(src,RE_BOM2_LE,2)==0 ) )
  {
    check[0].code = cc_utf16;
    return 1;
  }

  /* fprintf(stderr,"Unicode::Japanese::(xs)getcode[%d]\n",len); */
  /* fprintf(stderr,">>%s<<\n",src); */
  /* bin_dump("in ",src,len); */

  /* Start with every candidate in its initial state. */
  memcpy(check,cc_tmpl,sizeof(cc_tmpl));
  cc_max = cc_tmpl_max;

  for( ; src<src_end; ++src )
  {
    int invalids;
    int i;
#if TEST && GC_DISP
    fprintf(stderr,"[%d] '%c' 0x%02x (%d)\n",len-(src_end-src),(0x20<=*src&&*src<=0x7f?*src:'.'),*src,*src);
#endif
    /* advance every live candidate by one transition on the current byte */
    invalids = 0;
    for( i=0; i<cc_max; ++i )
    {
      int nxt = check[i].table[*src];
#if TEST && GC_DISP
      fprintf(stderr,"  %s : %d (%s)\n",charcodeToStr(check[i].code),nxt,nxt!=map_invalid?check[i].msg[nxt]:"invalid");
#endif
      if( nxt!=map_invalid )
      {
	/* each state owns a 256-entry row in the table */
	check[i].table = check[i].base+nxt*256;
      }else
      {
	/* mark as rejected; removed by the compaction below */
	++invalids;
	check[i].table = NULL;
      }
    }
    if( invalids==0 )
    { /* every candidate is still alive */
      continue;
    }else if( cc_max-invalids>0 )
    { /* some candidates remain: compact the array, preserving order */
      int rd = 0;
      int wr = 0;
      for( ;rd<cc_max; ++rd )
      {
	if( check[rd].table )
	{
	  if( rd!=wr )
	  {
	    check[wr] = check[rd];
	  }
	  ++wr;
	}
      }
      cc_max = wr;
    }else
    { /* every candidate was rejected */
      return 0;
    }
  }

  /* check if we have stopped at a valid (final?) state */
  /* A candidate is kept only if it is back in its initial state
   * (table == base) or in a state _is_acceptable_state allows. */
  {
    int wr = 0;
    int i;
    for( i=0; i<cc_max; ++i )
    {
      if( check[i].table == check[i].base || _is_acceptable_state(&check[i]) )
      {
        if( wr!=i )
	{
	  check[wr] = check[i];
	}
	++wr;
      }
    }
    cc_max = wr;
  }

#if TEST && GC_DISP
  fprintf(stderr,"<availables>\n");
  {
    int i;
    for( i=0; i<cc_max; ++i )
    {
      fprintf(stderr,"  %s\n",charcodeToStr(check[i].code));
    }
  }
#endif
  
  return cc_max;
}

#ifndef NO_XSUBS

/*
 * xs_getcode: name the encoding of sv_str.
 *
 * Returns a new SV:
 *   - the result of new_SV_UNDEF() when sv_str is &PL_sv_undef, or a copy
 *     of PL_sv_undef when sv_str is not defined after get-magic;
 *   - "unknown" when getcode_list reports no candidate;
 *   - otherwise the name of the first candidate from getcode_list.
 * For a candidate code with no name, TEST builds return NULL and other
 * builds return "unknown".
 */
SV* xs_getcode(SV* sv_str)
{
  CodeCheck candidates[cc_tmpl_max];
  int found;
  int pick = 0;  /* always report the first surviving candidate */

  if( sv_str==&PL_sv_undef )
  {
    return new_SV_UNDEF();
  }
  if( SvGMAGICAL(sv_str) )
  {
    mg_get(sv_str);
  }
  if( !SvOK(sv_str) )
  {
    return newSVsv(&PL_sv_undef);
  }

  found = getcode_list(sv_str, candidates);
  if( found<=0 )
  {
    /* nothing accepted the input */
    return new_CC_UNKNOWN();
  }

#if TEST && GC_DISP
  fprintf(stderr,"<selected>\n");
  fprintf(stderr,"  %d of 0..%d\n",pick,found-1);
  fprintf(stderr,"  %s\n",charcodeToStr(candidates[pick].code));
#endif

  switch(candidates[pick].code)
  {
  case cc_unknown:    return new_CC_UNKNOWN();
  case cc_ascii:      return new_CC_ASCII();
  case cc_sjis:       return new_CC_SJIS();
  case cc_eucjp:      return new_CC_EUCJP();
  case cc_jis:        return new_CC_JIS();
  case cc_jis_au:     return new_CC_JIS_AU();
  case cc_jis_jsky:   return new_CC_JIS_JSKY();
  case cc_utf8:       return new_CC_UTF8();
  case cc_utf16:      return new_CC_UTF16();
  case cc_utf32:      return new_CC_UTF32();
  case cc_utf32_be:   return new_CC_UTF32_BE();
  case cc_utf32_le:   return new_CC_UTF32_LE();
  case cc_sjis_jsky:  return new_CC_SJIS_JSKY();
  case cc_sjis_imode: return new_CC_SJIS_IMODE();
  case cc_sjis_doti:  return new_CC_SJIS_DOTI();
  case cc_sjis_au:    return new_CC_SJIS_AU();
  default:
    break;
  }

  /* code without a name: fall back exactly as the switch default did */
#ifdef TEST
  return NULL;
#else
  return new_CC_UNKNOWN();
#endif
}

/*
 * xs_getcode_list: store the names of every encoding that accepts sv_str
 * into the XS return slots ST(0)..ST(n-1).
 *
 *   sv_str - scalar to inspect; get-magic is invoked if present.
 *
 * Returns n, the number of slots filled; 0 when sv_str is undef or when
 * getcode_list finds no candidate.  Each stored value is a new mortal SV
 * holding the encoding name, in the order getcode_list returned them.
 * NOTE(review): the caller is presumably an XSUB that then returns n
 * values from the stack - confirm against the .xs file.
 */
int xs_getcode_list(SV* sv_str)
{
  int matches;
  CodeCheck check[cc_tmpl_max];
  int i;
  dSP; dMARK; dAX; /* XSARGS; - items */
  
  if( sv_str==&PL_sv_undef )
  {
    return 0;
  }
  if( SvGMAGICAL(sv_str) )
  {
    mg_get(sv_str);
  }
  if( !SvOK(sv_str) )
  {
    return 0;
  }
  matches = getcode_list(sv_str, check);
  if( matches<=0 )
  {
    return 0;
  }
  /* make sure the stack has room for one SV per candidate */
  EXTEND(SP, matches);
  for( i=0; i<matches; ++i )
  {
    switch(check[i].code)
    {
    case cc_unknown:    ST(i) = sv_2mortal( new_CC_UNKNOWN()    ); break;
    case cc_ascii:      ST(i) = sv_2mortal( new_CC_ASCII()      ); break;
    case cc_sjis:       ST(i) = sv_2mortal( new_CC_SJIS()       ); break;
    case cc_eucjp:      ST(i) = sv_2mortal( new_CC_EUCJP()      ); break;
    case cc_jis:        ST(i) = sv_2mortal( new_CC_JIS()        ); break;
    case cc_jis_au:     ST(i) = sv_2mortal( new_CC_JIS_AU()     ); break;
    case cc_jis_jsky:   ST(i) = sv_2mortal( new_CC_JIS_JSKY()   ); break;
    case cc_utf8:       ST(i) = sv_2mortal( new_CC_UTF8()       ); break;
    case cc_utf16:      ST(i) = sv_2mortal( new_CC_UTF16()      ); break;
    case cc_utf32:      ST(i) = sv_2mortal( new_CC_UTF32()      ); break;
    case cc_utf32_be:   ST(i) = sv_2mortal( new_CC_UTF32_BE()   ); break;
    case cc_utf32_le:   ST(i) = sv_2mortal( new_CC_UTF32_LE()   ); break;
    case cc_sjis_jsky:  ST(i) = sv_2mortal( new_CC_SJIS_JSKY()  ); break;
    case cc_sjis_imode: ST(i) = sv_2mortal( new_CC_SJIS_IMODE() ); break;
    case cc_sjis_doti:  ST(i) = sv_2mortal( new_CC_SJIS_DOTI()  ); break;
    /* previously missing: a sjis-au candidate fell through to "unknown",
     * although xs_getcode reports it as "sjis-au" */
    case cc_sjis_au:    ST(i) = sv_2mortal( new_CC_SJIS_AU()    ); break;
    default:            ST(i) = sv_2mortal( new_CC_UNKNOWN()    ); break;
    }
  }
  return matches;
}

#endif

/* ----------------------------------------------------------------------------
 * End of File.
 * ------------------------------------------------------------------------- */