The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* 
 * $Id: Map.xs,v 1.28 1998/03/23 23:57:46 schwartz Exp $
 *
 * ALPHA version
 *
 * Unicode::Map - C extensions
 *
 * Interface documentation at Map.pm
 *
 * Copyright (C) 1998, 1999, 2000 Martin Schwartz. All rights reserved.
 * This program is free software; you can redistribute it and/or
 * modify it under the same terms as Perl itself.
 *
 * Contact: Martin Schwartz <martin@nacho.de>
 */

#ifdef __cplusplus
extern "C" {
#endif
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#ifdef __cplusplus
}
#endif

/* 
 * It seems that dowarn isn't defined on some systems, PL_dowarn not on 
 * others. Gisle Aas deals with it this way:
 */
#include "patchlevel.h"
#if PATCHLEVEL <= 4 && !defined(PL_dowarn)
   #define PL_dowarn dowarn
#endif

/*
 *
 * "Map.h"
 *
 */

#define M_MAGIC               0xb827 /* magic word */
#define MAP8_BINFILE_MAGIC_HI 0xfffe /* magic word for Gisle's file format */
#define MAP8_BINFILE_MAGIC_LO 0x0001 /* */

#define M_END   0       /* end */
#define M_INF   1       /* infinite subsequent entries (default) */
#define M_BYTE  2       /* 1..255 subsequent entries  */
#define M_VER   4       /* (Internal) file format revision. */
#define M_AKV   6       /* key1, val1, key2, val2, ... (default) */
#define M_AKAV  7       /* key1, key2, ..., val1, val2, ... */
#define M_PKV   8       /* partial key value mappings */
#define M_CKn   10      /* compress keys not */
#define M_CK    11      /* compress keys (default) */
#define M_CVn   13      /* compress values not */
#define M_CV    14      /* compress values (default) */

#define I_NAME  20      /* Info: (wstring) Character Set Name */
#define I_ALIAS 21      /* Info: (wstring) Charset alias (several entries ok) */
#define I_VER   22      /* Info: (wstring) Mapfile revision */
#define I_AUTH  23 	/* Info: (wstring) Mapfile authRess */
#define I_INFO  24      /* Info: (wstring) Some userEss definable string */

#define T_BAD   0	/* Type: unknown */
#define T_MAP8  1	/* Type: Map8 style */
#define T_MAP   2	/* Type: Map style */

#define num1_DEFAULT    M_INF;
#define method1_DEFAULT M_AKV;
#define keys1_DEFAULT   M_CK;
#define values1_DEFAULT M_CV;

/* No function prototypes (as very old C-Compilers don't like them) */

/*
 *
 * "Map.c"
 *
 */

U8  _byte(char** buf) { 
   U8* tmp = (U8*) *buf; *buf+=1; return tmp[0]; 
}
U16 _word(char** buf) {
   U16 tmp; memcpy ((char*) &tmp, *buf, 2); *buf+=2; return ntohs(tmp);
}
U32 _long(char** buf) {
   U32 tmp; memcpy ((char*) &tmp, *buf, 4); *buf+=4; return ntohl(tmp);
}

AV* __system_test (void) {
/*
 * If this test suit gets passed ok, the C methods will probably work.
 */
   char* check = "\x01\x04\xfe\x83\x73\xf8\x04\x59\x19";
   char* buf;
   AV*   list = newAV();
   U32   i, k;

   /*
    * Have the Unn the bytesize I assume?
    */
   if (sizeof(U8)!=1)  { av_push (list, newSVpv("1a", 1)); }
   if (sizeof(U16)!=2) { av_push (list, newSVpv("1b", 1)); }
   if (sizeof(U32)!=4) { av_push (list, newSVpv("1c", 1)); }
   
   /*
    * Does _byte work?
    */
   buf = check;                   
   if (_byte(&buf) != 0x01) { av_push(list, newSVpv("2a", 2)); }
   if (_byte(&buf) != 0x04) { av_push(list, newSVpv("2b", 2)); }
   if (_byte(&buf) != 0xfe) { av_push(list, newSVpv("2c", 2)); }
   if (_byte(&buf) != 0x83) { av_push(list, newSVpv("2d", 2)); }
   
   /*
    * Are _word and _long really reading Network order?
    */
   if (_word(&buf) != 0x73f8)     { av_push(list, newSVpv("3a", 2)); }
   if (_word(&buf) != 0x0459)     { av_push(list, newSVpv("3b", 2)); }
   buf = check + 1;
   if (_byte(&buf) != 0x04)       { av_push(list, newSVpv("4a", 2)); }
   if (_long(&buf) != 0xfe8373f8) { av_push(list, newSVpv("4b", 2)); }
   
   /*
    * Is U32 really not an I32?
    */
   buf = check + 2;
   i = _long(&buf);
   i ++;
   if (i != 0xfe8373f9) { av_push(list, newSVpv("5", 1)); }
   
   k = htonl(0x12345678);
   if (memcmp((char*)&k+(4-1), "\x78", 1)) { 
      av_push(list, newSVpv("6a", 2)); 
   }
   if (memcmp((char*)&k+(4-2), "\x56\x78", 2)) { 
      av_push(list, newSVpv("6b", 2)); 
   }
   if (memcmp((char*)&k+(4-4), "\x12\x34\x56\x78", 4)) { 
      av_push(list, newSVpv("6c", 2)); 
   }

   return (list);
}

int
__limit_ol (SV* string, SV* o, SV* l, char** ro, U32* rl, U16 cs) {
/*
 * Checks, if offset and length are valid. If offset is negative, it is
 * treated like a negative offset in perl.
 *
 * When successful, sets ro (real offset) and rl (real length).
 */
   STRLEN  slen;
   char*   address;
   I32     offset;
   U32     length;

   *ro = 0;
   *rl = 0;

   if (!SvOK(string)) {
      if (PL_dowarn) { warn ("String undefined!"); }
      return (0);
   }

   address = SvPV (string, slen);
   offset  = SvOK(o) ? SvIV(o) : 0;
   length  = SvOK(l) ? SvIV(l) : slen;

   if (offset < 0) {
      offset += slen;
   }

   if (offset < 0) {
      offset = 0;
      length = slen;
      if (PL_dowarn) { warn ("Bad negative string offset!"); }
   }

   if (offset > slen) {
      offset = slen;
      length = 0;
      if (PL_dowarn) { warn ("String offset to big!"); }
   }

   if (offset + length > slen) {
      length = slen - offset;
      if (PL_dowarn) { warn ("Bad string length!"); }
   }

   if (length % cs != 0) {
      if (length>cs) {
         length -= (length % cs);
      } else {
         length = 0;
      }
      if (PL_dowarn) { warn("Bad string size!"); }
   }

   *ro = address + offset;
   *rl = length;

   return (1);
}

int
__get_mode (char** buf, U8* num, U8* method, U8* keys, U8* values) {
   U8 type, size;

   type = _byte(buf);
   size = _byte(buf); *buf += size;

   switch (type) {
      case M_INF:
      case M_BYTE:
         *num = type; break;
      case M_AKV:
      case M_AKAV:
      case M_PKV:
         *method = type; break;
      case M_CKn:
      case M_CK:
         *keys = type; break;
      case M_CVn:
      case M_CV:
         *values = type; break;
   }
   return (type);
}

/*
 *  void = __read_binary_mapping (bufS, oS, UR, CR)
 *
 *  Table of mode combinations:
 *  
 *  Mode      | n1  n2  | INF  BYTE  |  CK  CKn  |  CV  CVn
 *  ---------------------------------------------------------
 *  AKV       |         |            |           |
 *  AKAV      |         |            |           |
 *  PKV   ok  | ==1 ==1 |      ok    |  ok       |  ok
 */
int
__read_binary_mapping (SV* bufS, SV* oS, SV* UR, SV* CR) {
   char* buf;
   U32 o;
   HV* U; SV* uR; HV* u;
   HV* C; SV* cR; HV* c;
   
   int   buflen;
   char* bufmax;
   U8    cs1, cs1b, cs2, cs2b;
   U32   n1, n2;
   U16   check;
   U16   type=T_BAD;
   U8    num1, method1, keys1, values1;
   I16   kn, vn;
   U32   kbegin, vbegin;
   SV*   Ustr;
   SV*   Cstr;
   SV**  tmp_spp;
   
   buf =        SvPVX (bufS);
   o   =        SvIV (oS);
   U   = (HV *) SvRV (UR);
   C   = (HV *) SvRV (CR);

   buflen = SvCUR(bufS); if (buflen < 2) { 
      /*
       * Too short file. (No place for magic)
       */
      if ( PL_dowarn ) { warn ( "Bad map file: too short!" ); }
      return (0); 
   }
   bufmax = buf + buflen;
   buf += o;
   check = _word(&buf);

   if (check == M_MAGIC) {
      type = T_MAP;
   } else if (
      ( check == MAP8_BINFILE_MAGIC_HI ) &&
      ( _word(&buf) == MAP8_BINFILE_MAGIC_LO )
   ) {
      type = T_MAP8;
   }

   if (type == T_BAD) {
      if ( PL_dowarn ) { warn ( "Unknown map file format!" ); }
      return (0);
   }

   num1    = num1_DEFAULT;
   method1 = method1_DEFAULT;
   keys1   = keys1_DEFAULT;
   values1 = values1_DEFAULT;

   while (buf<bufmax) {
      U8 num2, method2, keys2, values2;
      num2=num1; method2=method1; keys2=keys1; values2=values1;

      if (type == T_MAP) {
         cs1 = _byte (&buf);
         if (!cs1) {
            if (__get_mode(&buf, &num1, &method1, &keys1, &values1) == M_END) {
               break;
            }
            continue;
         } else {
            n1  = _byte (&buf);
            cs2 = _byte (&buf);
            n2  = _byte (&buf);
         }
         cs1b = (cs1+7)/8;
         cs2b = (cs2+7)/8;
      } else if (type == T_MAP8) {
         cs1b=1; n1=1; cs2b=2; n2=1;
      }

      Ustr = newSVpvf ("%d,%d,%d,%d", cs1b, n1, cs2b, n2);
      Cstr = newSVpvf ("%d,%d,%d,%d", cs2b, n2, cs1b, n1);

      /*
       * Get, create hash for submapping of %U
       */
      if (!hv_exists_ent(U, Ustr, 0)) {
         hv_store_ent(U, Ustr, newRV_inc((SV*) newHV()), 0);
      }
      tmp_spp = hv_fetch(U, SvPVX(Ustr), SvCUR(Ustr), 0);
      if (!tmp_spp) {
         if ( PL_dowarn ) { warn ( "Can't retrieve U submapping!" ); }
         return (0);
      } else {
         uR = (SV *) *tmp_spp;
         u  = (HV *) SvRV (uR);
      }

      /*
       * Get, create hash for submapping of %C
       */
      if (!hv_exists_ent(C, Cstr, 0)) {
         hv_store_ent(C, Cstr, newRV_inc((SV*) newHV()), 0);
      }
      tmp_spp = hv_fetch(C, SvPVX(Cstr), SvCUR(Cstr), 0);
      if (!tmp_spp) {
         if ( PL_dowarn ) { warn ( "Can't retrieve C submapping!" ); }
         return (0);
      } else {
         cR = (SV *) *tmp_spp;
         c  = (HV *) SvRV (cR);
      }

      if (type == T_MAP8) {
      /*
       * Map8 mode
       */
         /*
          * => All (key, value) pairs
          */
         SV* tmpk; SV* tmpv;
         while (buf<bufmax) {
            if (buf[0] != '\0') {
               if ( PL_dowarn ) { warn ( "Bad map file!" ); }
               return (0);
            }
            tmpk = newSVpv(buf+1, 1); buf += 2;
            tmpv = newSVpv(buf  , 2); buf += 2;
            if (buf > bufmax) { break; }

            hv_store_ent(u, tmpk, tmpv, 0);
            hv_store_ent(c, tmpv, tmpk, 0);
         }
      } else if (method1==M_AKV) {
      /*
       * Map mode
       */
         U32 ksize = n1*cs1b; SV* tmpk;
         U32 vsize = n2*cs2b; SV* tmpv;
         if ( num1==M_INF ) {
            /*
             * All (key, value) pairs
             */
            while (buf<bufmax) {
               if ( buf+ksize+vsize>bufmax ) {
                  buf += ( ksize+vsize );
                  break;
               }
               tmpk = newSVpv(buf, ksize); buf += ksize;
               tmpv = newSVpv(buf, vsize); buf += vsize;
               hv_store_ent(c, tmpv, tmpk, 0);
               hv_store_ent(u, tmpk, tmpv, 0);
            }
         } else if ( num1==M_BYTE ) {
            while ( buf<bufmax ) {
               if (!(kn=_byte(&buf))) { 
                  if (__get_mode(&buf,&num2,&method2,&keys2,&values2)==M_END) {
                     break;
                  }
               }
               while ( kn>0 ) {
                  if ( buf+ksize+vsize>bufmax ) {
                     buf += ( ksize+vsize );
                     break;
                  }
                  tmpk = newSVpv(buf, ksize); buf += ksize;
                  tmpv = newSVpv(buf, vsize); buf += vsize;
                  hv_store_ent(c, tmpv, tmpk, 0);
                  hv_store_ent(u, tmpk, tmpv, 0);
                  kn--;
               }
            }
         }
      } else if (method1==M_AKAV) {
         /*
          * First all keys, then all values
          */
         if ( PL_dowarn ) { warn ( "M_AKAV not supported!" ); }
         return (0);
      } else if (method1==M_PKV) {
         /*
          * Partial 
          */
         if (num1==M_INF) { 
            /* no infinite mode */
            if ( PL_dowarn ) { warn ( "M_INF not supported for M_PKV!" ); }
            return (0); 
         } 
         while(buf<bufmax) {
            U8 num3, method3, keys3, values3;
            num3=num2; method3=method2; keys3=keys2; values3=values2;
            if (!(kn = _byte(&buf))) { 
               if (__get_mode(&buf,&num2,&method2,&keys2,&values2)==M_END) {
                  break;
               }
               continue;
            }
            switch (cs1b) {
               case 1: kbegin = _byte(&buf); break;
               case 2: kbegin = _word(&buf); break;
               case 4: kbegin = _long(&buf); break;
               default:
                  if ( PL_dowarn ) { warn ( "Unknown element size!" ); }
                  return (0);
            }
            while (kn>0) {
               if (values3==M_CV) {
                  /*
                   * Partial, keys compressed, values compressed
                   */
                  SV* tmpk; U32 k;
                  SV* tmpv; U32 v;
                  U32 max;
                  vn = _byte(&buf);
                  if (!vn) { 
                     if(__get_mode(&buf,&num3,&method3,&keys3,&values3)==M_END){
                        break;
                     }
                     continue;
                  }
                  if ((n1 != 1) || (n2 != 1)) {
                     /*
                      * n (n>1) characters cannot be mapped to one integer
                      */
                     if ( PL_dowarn ) { warn("Bad map file: count mismatch!"); }
                     return (0);
                  }
                  switch (cs2b) {
                     case 1: vbegin = _byte(&buf); break;
                     case 2: vbegin = _word(&buf); break;
                     case 4: vbegin = _long(&buf); break;
                     default: 
                        if ( PL_dowarn ) { warn ( "Unknown element size!" ); }
                        return (0);
                  }

                  max = kbegin + vn;
                  for (; kbegin<max; kbegin++, vbegin++) {
               
                     k = htonl(kbegin);
                     tmpk = newSVpv((char *) &k + (4-cs1b), cs1b);
               
                     v = htonl(vbegin);
                     tmpv = newSVpv((char *) &v + (4-cs2b), cs2b);

                     hv_store_ent(c, tmpv, tmpk, 0);
                     hv_store_ent(u, tmpk, tmpv, 0);
                  }
                  kn-=vn;

               } else if (values3==M_CVn) {
                  /*
                   * Partial, keys compressed, values not compressed
                   */
                  U32 v;
                  U32 vsize = n2*cs2b;
                  SV* tmpk;
                  SV* tmpv;
                  if (n1 != 1) {
                     if ( PL_dowarn ) { warn ( "Bad map file: mismatch 2!" ); }
                     return (0);
                  }
                  while (kn--) {
                     v = htonl(kbegin);
                     tmpk = newSVpv((char *) &v + (4-cs1b), cs1b);
                     tmpv = newSVpv(buf, vsize); buf += vsize;

                     hv_store_ent(u, tmpk, tmpv, 0);
                     hv_store_ent(c, tmpv, tmpk, 0);

                     kbegin++;
                  }
               } else {
               /*
                * Unknown value compression.
                */
                  if ( PL_dowarn ) { warn ( "Unknown compression!" ); }
                  return (0);
               }
            }
         }
      } else {
         /*
          * unknown method
          */
         if ( PL_dowarn ) { warn ( "Unknown method!" ); }
         return (0);
      }
   }

   return (1);
}

/*
 *
 * "Map.xs"
 *
 */

MODULE = Unicode::Map	PACKAGE = Unicode::Map

PROTOTYPES: DISABLE

#
# $text = $Map -> reverse_unicode($text)
#
SV*
_reverse_unicode(Map, text)
        SV*  Map
        SV*  text

        PREINIT:
        int i; 
        char c;
        STRLEN len; 
        char* src; 
        char* dest;

        PPCODE:
	src = SvPV (text, len);
	if (PL_dowarn && (len % 2) != 0) {
    	   warn("Bad string size!"); len--;
	}
        /* Code below adapted from GAAS's Unicode::String */
        if ( GIMME_V == G_VOID ) {
           if ( SvREADONLY(text) ) {
              die ( "reverse_unicode: string is readonly!" );
           }
           dest = src;
        } else {
           SV* dest_sv = sv_2mortal ( newSV(len+1) );
           SvCUR_set ( dest_sv, len );
           *SvEND ( dest_sv ) = 0;
           SvPOK_on ( dest_sv );
           PUSHs ( dest_sv );
           dest = SvPVX ( dest_sv );
        }
        for ( ; len>=2; len-=2 ) {
            char tmp = *src++;
            *dest++ = *src++;
            *dest++ = tmp;
        }

#
# $mapped_str = $Map -> _map_hash($string, \%mapping, $bytesize, offset, length)
#
# bytesize, offset, length in terms of bytes.
#
# bytesize gives the size of one character for this mapping.
#
SV*
_map_hash(Map, string, mappingR, bytesize, o, l)
        SV*  Map
        SV*  string
        SV*  mappingR
        SV*  bytesize
        SV*  o
        SV*  l

        PREINIT:
        char* offset; U32 length; U16 bs;
        char* smax;
        HV*   mapping;
        SV**  tmp;

        CODE:
        bs = SvIV(bytesize);
        __limit_ol (string, o, l, &offset, &length, bs);
        smax = offset + length;

        RETVAL = newSV((length/bs+1)*2);
        mapping = (HV *) SvRV(mappingR);

        for (; offset<smax; offset+=bs) {
           if (tmp = hv_fetch(mapping, offset, bs, 0)) {
              if ( SvOK(RETVAL) ) {
                 sv_catsv(RETVAL, *tmp); 
              } else {
                 sv_setsv(RETVAL, *tmp);
              }
           } else {
              /* No mapping character found! */
           }
        }

        OUTPUT:
	   RETVAL


#
# $mapped_str = $Map -> _map_hashlist($string, [@{\%mapping}], [@{$bytesize}])
#
# bytesize gives the size of one character for this mapping.
#
SV*
_map_hashlist(Map, string, mappingRLR, bytesizeLR, o, l)
        SV*  Map
        SV*  string
        SV*  mappingRLR
        SV*  bytesizeLR
        SV*  o
        SV*  l

        PREINIT:
        int j, max;
        AV* mappingRL; HV* mapping;
        AV* bytesizeL; int bytesize;
        SV** tmp;
        char* offset; U32 length; char* smax; 

        CODE:
        __limit_ol (string, o, l, &offset, &length, 1);
        smax = offset + length;

        RETVAL = newSV((length+1)*2);

	mappingRL = (AV *) SvRV(mappingRLR);
        bytesizeL = (AV *) SvRV(bytesizeLR);
        max = av_len(mappingRL);
        if (max != av_len(bytesizeL)) {
	   warn("$#mappingRL != $#bytesizeL!");
	} else {
           max++;
           for (; offset<smax; ) {
              for (j=0; j<=max; j++) {
                 if (j==max) {
                    /* No mapping character found! 
                     * How many bytes does this unknown character consume?
                     * Sigh, assume 2.
                     */
                    offset += 2;
                 } else {
  	            if (tmp = av_fetch(mappingRL, j, 0)) {
                       mapping = (HV *) SvRV((SV*) *tmp);
                       if (tmp = av_fetch(bytesizeL, j, 0)) {
                          bytesize = SvIV(*tmp);
                          if (tmp = hv_fetch(mapping, offset, bytesize, 0)) {
                             if ( SvOK(RETVAL) ) {
                                sv_catsv(RETVAL, *tmp); 
                             } else {
                                sv_setsv(RETVAL, *tmp);
                             }
                             offset+=bytesize;
                             break;
                          }
                       }
                    }
                 }
              }
           }
        }

        OUTPUT:
	   RETVAL


#
# status = $S->_read_binary_mapping($buf, $o, \%U, \%C);
#
SV*
_read_binary_mapping (MapS, bufS, oS, UR, CR)
	SV* MapS
	SV* bufS
	SV* oS
	SV* UR
	SV* CR

	CODE:
	RETVAL = newSViv(__read_binary_mapping(bufS, oS, UR, CR));

	OUTPUT:
	   RETVAL


#
# 0 || errornum = $S->_test ()
#
AV*
_system_test (void)
	CODE:
	RETVAL = __system_test();
	OUTPUT:
	RETVAL