/*
* tclUtf.c --
*
* Routines for manipulating UTF-8 strings.
*
* Copyright (c) 1997-1998 Sun Microsystems, Inc.
*
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
* RCS: @(#) $Id: tclUtf.c,v 1.30.2.2 2003/10/08 14:21:20 dkf Exp $
*/
#include "tclInt.h"
/*
* Include the static character classification tables and macros.
*/
#include "tclUniData.c"
/*
* The following macros are used for fast character category tests. The
* x_BITS values are shifted right by the category value to determine whether
* the given category is included in the set.
*/
/*
* Letter categories. Tested on its own by Tcl_UniCharIsAlpha() and,
* combined with DIGIT_BITS, by Tcl_UniCharIsAlnum().
*/
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
/*
* The decimal-digit category only.
*/
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
/*
* The three separator categories (space, line, paragraph). Also folded
* into PRINT_BITS below.
*/
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
| (1 << PARAGRAPH_SEPARATOR))
/*
* The connector-punctuation category only.
*/
#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
/*
* Every category treated as printable: letters, digits, separators, marks,
* the remaining number categories, punctuation and symbols. Tested by
* Tcl_UniCharIsPrint() and Tcl_UniCharIsGraph().
*/
#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
(1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
/*
* All seven punctuation categories. Tested by Tcl_UniCharIsPunct().
*/
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
/*
* Unicode characters less than this value are represented by themselves
* (as a single byte) in UTF-8 strings. The one exception in this file is
* the character 0: UtfCount() and Tcl_UniCharToUtf() require ch > 0 for
* the single-byte case, so 0 is stored as the two bytes 0xC0,0x80.
*/
#define UNICODE_SELF 0x80
/*
* The following table is used when mapping between Unicode (UCS-2)
* and UTF-8. It is indexed by the first byte of a UTF-8 sequence and
* gives the total number of bytes that sequence is expected to occupy.
* Bytes that cannot start a multi-byte sequence under the configured
* TCL_UTF_MAX map to 1. Used by Tcl_UtfCharComplete() and, when
* TCL_UTF_MAX > 3, by Tcl_UtfToUniChar().
*/
static CONST unsigned char totalBytes[256] = {
/* 0x00 - 0xBF: single bytes and naked trail bytes. */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
/* 0xC0 - 0xDF: lead bytes of two-byte sequences. */
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
/* 0xE0 - 0xEF: lead bytes of three-byte sequences. */
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/* 0xF0 - 0xF7: four-byte lead bytes, only when TCL_UTF_MAX allows. */
#if TCL_UTF_MAX > 3
4,4,4,4,4,4,4,4,
#else
1,1,1,1,1,1,1,1,
#endif
/* 0xF8 - 0xFB: five-byte lead bytes, only when TCL_UTF_MAX allows. */
#if TCL_UTF_MAX > 4
5,5,5,5,
#else
1,1,1,1,
#endif
/* 0xFC - 0xFF: six-byte lead bytes, only when TCL_UTF_MAX allows. */
#if TCL_UTF_MAX > 5
6,6,6,6
#else
1,1,1,1
#endif
};
/*
* Procedures used only in this module.
*
* UtfCount() reports how many bytes Tcl_UniCharToUtf() would write for a
* character; the case-conversion routines use it to avoid growing a string
* that is being rewritten in place.
*/
static int UtfCount _ANSI_ARGS_((int ch));
/*
*---------------------------------------------------------------------------
*
* UtfCount --
*
* Find the number of bytes in the Utf character "ch".
*
* Results:
* The return value is the number of bytes in the Utf character "ch".
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
INLINE static int
UtfCount(ch)
    int ch;                     /* The Tcl_UniChar whose size is returned. */
{
    int numBytes;

    /*
     * Walk the encoding ranges from shortest to longest. Note that 0 (and
     * any value <= 0) does not qualify for the one-byte form and lands in
     * the two-byte range.
     */

    if ((ch > 0) && (ch < UNICODE_SELF)) {
        numBytes = 1;
    } else if (ch <= 0x7FF) {
        numBytes = 2;
    } else if (ch <= 0xFFFF) {
        numBytes = 3;
    }
#if TCL_UTF_MAX > 3
    else if (ch <= 0x1FFFFF) {
        numBytes = 4;
    } else if (ch <= 0x3FFFFFF) {
        numBytes = 5;
    } else if (ch <= 0x7FFFFFFF) {
        numBytes = 6;
    }
#endif
    else {
        /*
         * Anything that cannot be represented is counted as a three-byte
         * character.
         */

        numBytes = 3;
    }
    return numBytes;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UniCharToUtf --
*
* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
* provided buffer. Equivalent to Plan 9 runetochar().
*
* Results:
* The return value is the number of bytes in the buffer that
* were consumed.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
INLINE int
Tcl_UniCharToUtf(ch, str)
    int ch;                     /* The Tcl_UniChar to be stored in the
                                 * buffer. */
    char *str;                  /* Buffer in which the UTF-8 representation
                                 * of the Tcl_UniChar is stored. Buffer must
                                 * be large enough to hold the UTF-8 character
                                 * (at most TCL_UTF_MAX bytes). */
{
    /*
     * One byte: 0x01 - 0x7F stored verbatim. Zero is deliberately excluded
     * and falls through to the two-byte form below.
     */

    if ((ch > 0) && (ch < UNICODE_SELF)) {
        str[0] = (char) ch;
        return 1;
    }

    /*
     * Two bytes: 110xxxxx 10xxxxxx.
     */

    if (ch <= 0x7FF) {
        str[0] = (char) ((ch >> 6) | 0xC0);
        str[1] = (char) ((ch | 0x80) & 0xBF);
        return 2;
    }

    if (ch > 0xFFFF) {
#if TCL_UTF_MAX > 3
        /*
         * Four, five and six byte forms, available only when the build
         * allows sequences longer than three bytes.
         */

        if (ch <= 0x1FFFFF) {
            str[0] = (char) ((ch >> 18) | 0xF0);
            str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
            str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
            str[3] = (char) ((ch | 0x80) & 0xBF);
            return 4;
        }
        if (ch <= 0x3FFFFFF) {
            str[0] = (char) ((ch >> 24) | 0xF8);
            str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
            str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
            str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
            str[4] = (char) ((ch | 0x80) & 0xBF);
            return 5;
        }
        if (ch <= 0x7FFFFFFF) {
            str[0] = (char) ((ch >> 30) | 0xFC);
            str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
            str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
            str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
            str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
            str[5] = (char) ((ch | 0x80) & 0xBF);
            return 6;
        }
#endif
        /*
         * Not representable: substitute 0xFFFD and emit it with the
         * three-byte form below.
         */

        ch = 0xFFFD;
    }

    /*
     * Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
     */

    str[0] = (char) ((ch >> 12) | 0xE0);
    str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
    str[2] = (char) ((ch | 0x80) & 0xBF);
    return 3;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UniCharToUtfDString --
*
* Convert the given Unicode string to UTF-8.
*
* Results:
* The return value is a pointer to the UTF-8 representation of the
* Unicode string. Storage for the return value is appended to the
* end of dsPtr.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
char *
Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
    CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
    int numChars;               /* Length of Unicode string in Tcl_UniChars
                                 * (must be >= 0). */
    Tcl_DString *dsPtr;         /* UTF-8 representation of string is
                                 * appended to this previously initialized
                                 * DString. */
{
    CONST Tcl_UniChar *w, *wEnd;
    char *p, *string;
    int oldLength;

    /*
     * Each Tcl_UniChar converts to at most TCL_UTF_MAX bytes, so only the
     * characters being appended need worst-case room. The bytes already
     * held by dsPtr keep their size and must not be scaled by TCL_UTF_MAX;
     * doing so wastes memory in proportion to the existing contents and
     * invites integer overflow on long strings.
     */

    oldLength = Tcl_DStringLength(dsPtr);
    Tcl_DStringSetLength(dsPtr, oldLength + (numChars + 1) * TCL_UTF_MAX);
    string = Tcl_DStringValue(dsPtr) + oldLength;

    p = string;
    wEnd = wString + numChars;
    for (w = wString; w < wEnd; w++) {
        p += Tcl_UniCharToUtf(*w, p);
    }

    /*
     * Trim the DString back to the number of bytes actually produced.
     */

    Tcl_DStringSetLength(dsPtr, oldLength + (int) (p - string));

    return string;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfToUniChar --
*
* Extract the Tcl_UniChar represented by the UTF-8 string. Bad
* UTF-8 sequences are converted to valid Tcl_UniChars and processing
* continues. Equivalent to Plan 9 chartorune().
*
* The caller must ensure that the source buffer is long enough that
* this routine does not run off the end and dereference non-existent
* memory looking for trail bytes. If the source buffer is known to
* be '\0' terminated, this cannot happen. Otherwise, the caller
* should call Tcl_UtfCharComplete() before calling this routine to
* ensure that enough bytes remain in the string.
*
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
int
Tcl_UtfToUniChar(str, chPtr)
register CONST char *str; /* The UTF-8 string. */
register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
* by the UTF-8 string. */
{
register int byte;
/*
* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
*/
byte = *((unsigned char *) str);
if (byte < 0xC0) {
/*
* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
* characters representing themselves.
*/
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xE0) {
if ((str[1] & 0xC0) == 0x80) {
/*
* Two-byte-character lead-byte followed by a trail-byte.
*/
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
return 2;
}
/*
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xF0) {
if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
/*
* Three-byte-character lead byte followed by two trail bytes.
*/
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
return 3;
}
/*
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
*chPtr = (Tcl_UniChar) byte;
return 1;
}
#if TCL_UTF_MAX > 3
else {
/*
* Lead byte 0xF0 or above: look up the expected sequence length and
* accumulate the trail bytes in a loop.
*/
int ch, total, trail;
total = totalBytes[byte];
trail = total - 1;
if (trail > 0) {
/*
* Keep only the payload bits of the lead byte; the longer the
* sequence, the fewer payload bits the lead byte carries.
*/
ch = byte & (0x3F >> trail);
do {
str++;
if ((*str & 0xC0) != 0x80) {
/*
* Not a trail byte: the lead byte represents itself and only
* that one byte is consumed.
*/
*chPtr = byte;
return 1;
}
/*
* Append the six payload bits of this trail byte.
*/
ch <<= 6;
ch |= (*str & 0x3F);
trail--;
} while (trail > 0);
/*
* NOTE(review): ch is stored without a cast; if Tcl_UniChar is
* narrower than int the value is truncated here -- confirm against
* the Tcl_UniChar typedef.
*/
*chPtr = ch;
return total;
}
}
#endif
/*
* Reached for lead bytes that do not start a multi-byte sequence under
* the configured TCL_UTF_MAX: the byte represents itself.
*/
*chPtr = (Tcl_UniChar) byte;
return 1;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfToUniCharDString --
*
* Convert the UTF-8 string to Unicode.
*
* Results:
* The return value is a pointer to the Unicode representation of the
* UTF-8 string. Storage for the return value is appended to the
* end of dsPtr. The Unicode string is terminated with a Unicode
* NULL character.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
Tcl_UniChar *
Tcl_UtfToUniCharDString(string, length, dsPtr)
CONST char *string; /* UTF-8 string to convert to Unicode. */
int length; /* Length of UTF-8 string in bytes, or -1
* for strlen(). */
Tcl_DString *dsPtr; /* Unicode representation of string is
* appended to this previously initialized
* DString. */
{
Tcl_UniChar *w, *wString;
CONST char *p, *end;
int oldLength;
if (length < 0) {
length = strlen(string);
}
/*
* Unicode string length in Tcl_UniChars will be <= UTF-8 string length
* in bytes.
*/
oldLength = Tcl_DStringLength(dsPtr);
Tcl_DStringSetLength(dsPtr,
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
end = string + length;
for (p = string; p < end; ) {
p += TclUtfToUniChar(p, w);
w++;
}
*w = '\0';
Tcl_DStringSetLength(dsPtr,
(oldLength + ((char *) w - (char *) wString)));
return wString;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfCharComplete --
*
* Determine if the UTF-8 string of the given length is long enough
* to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
* UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
*
* Results:
* The return value is 0 if the string is not long enough, non-zero
* otherwise.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
int
Tcl_UtfCharComplete(str, len)
    CONST char *str;            /* String to check if first few bytes
                                 * contain a complete UTF-8 character. */
    int len;                    /* Length of above string in bytes. */
{
    /*
     * The first byte alone determines how many bytes the character is
     * expected to occupy; the trail bytes are not inspected.
     */

    int needed = totalBytes[*((unsigned char *) str)];

    return (len >= needed);
}
/*
*---------------------------------------------------------------------------
*
* Tcl_NumUtfChars --
*
* Returns the number of characters (not bytes) in the UTF-8 string,
* not including the terminating NULL byte. This is equivalent to
* Plan 9 utflen() and utfnlen().
*
* Results:
* As above.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
int
Tcl_NumUtfChars(str, len)
    register CONST char *str;   /* The UTF-8 string to measure. */
    int len;                    /* The length of the string in bytes, or -1
                                 * for strlen(string). */
{
    Tcl_UniChar ch;
    register Tcl_UniChar *chPtr = &ch;
    register int i;

    /*
     * The separate implementations are faster.
     *
     * Since this is a time-sensitive function, we also do the check for
     * the single-byte char case specially.
     */

    i = 0;
    if (len < 0) {
        /*
         * '\0'-terminated input: the decoder stops at the terminator, so
         * no length checks are needed.
         */

        while (*str != '\0') {
            str += TclUtfToUniChar(str, chPtr);
            i++;
        }
    } else {
        register int n;

        while (len > 0) {
            /*
             * A byte below 0xC0 always stands for one character. A lead
             * byte whose sequence would extend past the remaining "len"
             * bytes is also counted as a single one-byte character, so
             * the decoder is never asked to look beyond the caller's
             * buffer.
             */

            if ((UCHAR(*str) < 0xC0) || !Tcl_UtfCharComplete(str, len)) {
                len--;
                str++;
            } else {
                n = Tcl_UtfToUniChar(str, chPtr);
                len -= n;
                str += n;
            }
            i++;
        }
    }
    return i;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfFindFirst --
*
* Returns a pointer to the first occurrence of the given Tcl_UniChar
* in the NULL-terminated UTF-8 string. The NULL terminator is
* considered part of the UTF-8 string. Equivalent to Plan 9
* utfrune().
*
* Results:
* As above. If the Tcl_UniChar does not exist in the given string,
* the return value is NULL.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
CONST char *
Tcl_UtfFindFirst(string, ch)
    CONST char *string;         /* The UTF-8 string to be searched. */
    int ch;                     /* The Tcl_UniChar to search for. */
{
    Tcl_UniChar current;
    int width;

    for (;;) {
        width = TclUtfToUniChar(string, &current);

        /*
         * Test for a match before testing for the terminator, so that a
         * search for the NULL character finds the terminator itself.
         */

        if (current == ch) {
            return string;
        }
        if (*string == '\0') {
            return NULL;
        }
        string += width;
    }
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfFindLast --
*
* Returns a pointer to the last occurrence of the given Tcl_UniChar
* in the NULL-terminated UTF-8 string. The NULL terminator is
* considered part of the UTF-8 string. Equivalent to Plan 9
* utfrrune().
*
* Results:
* As above. If the Tcl_UniChar does not exist in the given string,
* the return value is NULL.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
CONST char *
Tcl_UtfFindLast(string, ch)
CONST char *string; /* The UTF-8 string to be searched. */
int ch; /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
CONST char *last;
last = NULL;
while (1) {
len = TclUtfToUniChar(string, &find);
if (find == ch) {
last = string;
}
if (*string == '\0') {
break;
}
string += len;
}
return last;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfNext --
*
* Given a pointer to some current location in a UTF-8 string,
* move forward one character. The caller must ensure that they
* are not asking for the next character after the last character
* in the string.
*
* Results:
* The return value is the pointer to the next character in
* the UTF-8 string.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
CONST char *
Tcl_UtfNext(str)
    CONST char *str;            /* The current location in the string. */
{
    Tcl_UniChar decoded;        /* Required by the decoder; value unused. */
    int width;

    /*
     * Decode one character only to learn how many bytes it occupies.
     */

    width = TclUtfToUniChar(str, &decoded);
    return str + width;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfPrev --
*
* Given a pointer to some current location in a UTF-8 string,
* move backwards one character. This works correctly when the
* pointer is in the middle of a UTF-8 character.
*
* Results:
* The return value is a pointer to the previous character in the
* UTF-8 string. If the current location was already at the
* beginning of the string, the return value will also be a
* pointer to the beginning of the string.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
CONST char *
Tcl_UtfPrev(str, start)
CONST char *str; /* The current location in the string. */
CONST char *start; /* Pointer to the beginning of the
* string, to avoid going backwards too
* far. */
{
CONST char *look;
int i, byte;
/*
* Step back one byte. This position is the fallback answer whenever no
* lead byte is found by the scan below.
*/
str--;
look = str;
/*
* Scan backwards over at most TCL_UTF_MAX bytes looking for the lead
* byte of the character that contains the fallback position.
*/
for (i = 0; i < TCL_UTF_MAX; i++) {
if (look < start) {
/*
* Ran off the front of the string. If even the fallback position is
* before the start, clamp it to the start.
*/
if (str < start) {
str = start;
}
break;
}
byte = *((unsigned char *) look);
if (byte < 0x80) {
/*
* A single-byte character ends the scan; return the fallback.
*/
break;
}
if (byte >= 0xC0) {
/*
* Found a lead byte. Note that no check is made that the sequence it
* starts is long enough to reach the fallback position.
*/
return look;
}
/*
* A trail byte (0x80 - 0xBF): keep looking further back.
*/
look--;
}
return str;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UniCharAtIndex --
*
* Returns the Unicode character represented at the specified
* character (not byte) position in the UTF-8 string.
*
* Results:
* As above.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
Tcl_UniChar
Tcl_UniCharAtIndex(src, index)
    register CONST char *src;   /* The UTF-8 string to dereference. */
    register int index;         /* The position of the desired character. */
{
    /*
     * Initialized so that a negative index, for which the loop below never
     * runs, yields 0 instead of an indeterminate value.
     */

    Tcl_UniChar ch = 0;

    /*
     * Decode index+1 characters; the last one decoded is the answer.
     */

    while (index >= 0) {
        index--;
        src += TclUtfToUniChar(src, &ch);
    }
    return ch;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfAtIndex --
*
* Returns a pointer to the specified character (not byte) position
* in the UTF-8 string.
*
* Results:
* As above.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
CONST char *
Tcl_UtfAtIndex(src, index)
    register CONST char *src;   /* The UTF-8 string. */
    register int index;         /* The position of the desired character. */
{
    Tcl_UniChar skipped;        /* Required by the decoder; value unused. */

    /*
     * Skip "index" characters. A zero or negative index leaves src as is.
     */

    for ( ; index > 0; index--) {
        src += TclUtfToUniChar(src, &skipped);
    }
    return src;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfBackslash --
*
* Figure out how to handle a backslash sequence.
*
* Results:
* Stores the bytes represented by the backslash sequence in dst and
* returns the number of bytes written to dst. At most TCL_UTF_MAX
* bytes are written to dst; dst must have been large enough to accept
* those bytes. If readPtr isn't NULL then it is filled in with a
* count of the number of bytes in the backslash sequence.
*
* Side effects:
* The maximum number of bytes it takes to represent a Unicode
* character in UTF-8 is guaranteed to be less than the number of
* bytes used to express the backslash sequence that represents
* that Unicode character. If the target buffer into which the
* caller is going to store the bytes that represent the Unicode
* character is at least as large as the source buffer from which
* the backslashed sequence was extracted, no buffer overruns should
* occur.
*
*---------------------------------------------------------------------------
*/
int
Tcl_UtfBackslash(src, readPtr, dst)
    CONST char *src;            /* Points to the backslash character of
                                 * a backslash sequence. */
    int *readPtr;               /* Fill in with number of characters read
                                 * from src, unless NULL. */
    char *dst;                  /* Filled with the bytes represented by the
                                 * backslash sequence. */
{
#define LINE_LENGTH 128
    int consumed;               /* Bytes of src used by the sequence. */
    int written;                /* Bytes stored in dst. */

    /*
     * First try with a bounded look-ahead to avoid measuring src.
     */

    written = TclParseBackslash(src, LINE_LENGTH, &consumed, dst);
    if (consumed == LINE_LENGTH) {
        /*
         * The whole look-ahead window was consumed, so parse again with
         * the real length of src, at the cost of a strlen().
         */

        written = TclParseBackslash(src, (int)strlen(src), &consumed, dst);
    }

    if (readPtr != NULL) {
        *readPtr = consumed;
    }
    return written;
}
/*
*----------------------------------------------------------------------
*
* Tcl_UtfToUpper --
*
* Convert lowercase characters to uppercase characters in a UTF
* string in place. The conversion may shrink the UTF string.
*
* Results:
* Returns the number of bytes in the resulting string
* excluding the trailing null.
*
* Side effects:
* Writes a terminating null after the last converted character.
*
*----------------------------------------------------------------------
*/
int
Tcl_UtfToUpper(str)
    char *str;                  /* String to convert in place. */
{
    Tcl_UniChar ch, upChar;
    char *src, *dst;
    int bytes;

    /*
     * Iterate over the string until we hit the terminating null. dst never
     * gets ahead of src because a converted character is only stored when
     * it is no longer than the original.
     */

    src = dst = str;
    while (*src) {
        bytes = TclUtfToUniChar(src, &ch);
        upChar = Tcl_UniCharToUpper(ch);

        /*
         * To keep badly formed Utf strings from getting inflated by
         * the conversion (thereby causing a segfault), only copy the
         * upper case char to dst if its size is <= the original char.
         */

        if (bytes < UtfCount(upChar)) {
            /*
             * Keep the original bytes. src and dst address the same
             * buffer and may overlap (or be equal), so memmove() is
             * required; memcpy() is undefined for overlapping regions.
             */

            memmove(dst, src, (size_t) bytes);
            dst += bytes;
        } else {
            dst += Tcl_UniCharToUtf(upChar, dst);
        }
        src += bytes;
    }
    *dst = '\0';
    return (int) (dst - str);
}
/*
*----------------------------------------------------------------------
*
* Tcl_UtfToLower --
*
* Convert uppercase characters to lowercase characters in a UTF
* string in place. The conversion may shrink the UTF string.
*
* Results:
* Returns the number of bytes in the resulting string
* excluding the trailing null.
*
* Side effects:
* Writes a terminating null after the last converted character.
*
*----------------------------------------------------------------------
*/
int
Tcl_UtfToLower(str)
    char *str;                  /* String to convert in place. */
{
    Tcl_UniChar ch, lowChar;
    char *src, *dst;
    int bytes;

    /*
     * Iterate over the string until we hit the terminating null. dst never
     * gets ahead of src because a converted character is only stored when
     * it is no longer than the original.
     */

    src = dst = str;
    while (*src) {
        bytes = TclUtfToUniChar(src, &ch);
        lowChar = Tcl_UniCharToLower(ch);

        /*
         * To keep badly formed Utf strings from getting inflated by
         * the conversion (thereby causing a segfault), only copy the
         * lower case char to dst if its size is <= the original char.
         */

        if (bytes < UtfCount(lowChar)) {
            /*
             * Keep the original bytes. src and dst address the same
             * buffer and may overlap (or be equal), so memmove() is
             * required; memcpy() is undefined for overlapping regions.
             */

            memmove(dst, src, (size_t) bytes);
            dst += bytes;
        } else {
            dst += Tcl_UniCharToUtf(lowChar, dst);
        }
        src += bytes;
    }
    *dst = '\0';
    return (int) (dst - str);
}
/*
*----------------------------------------------------------------------
*
* Tcl_UtfToTitle --
*
* Changes the first character of a UTF string to title case or
* uppercase and the rest of the string to lowercase. The
* conversion happens in place and may shrink the UTF string.
*
* Results:
* Returns the number of bytes in the resulting string
* excluding the trailing null.
*
* Side effects:
* Writes a terminating null after the last converted character.
*
*----------------------------------------------------------------------
*/
int
Tcl_UtfToTitle(str)
    char *str;                  /* String to convert in place. */
{
    Tcl_UniChar ch, titleChar, lowChar;
    char *src, *dst;
    int bytes;

    /*
     * Capitalize the first character and then lowercase the rest of the
     * characters until we get to a null.
     *
     * In both phases a converted character is stored only if it is no
     * longer than the original, so dst never gets ahead of src. When the
     * original bytes are kept instead, src and dst address the same buffer
     * and may overlap (or be equal), so memmove() is required; memcpy() is
     * undefined for overlapping regions.
     */

    src = dst = str;

    if (*src) {
        bytes = TclUtfToUniChar(src, &ch);
        titleChar = Tcl_UniCharToTitle(ch);

        if (bytes < UtfCount(titleChar)) {
            memmove(dst, src, (size_t) bytes);
            dst += bytes;
        } else {
            dst += Tcl_UniCharToUtf(titleChar, dst);
        }
        src += bytes;
    }
    while (*src) {
        bytes = TclUtfToUniChar(src, &ch);
        lowChar = Tcl_UniCharToLower(ch);

        if (bytes < UtfCount(lowChar)) {
            memmove(dst, src, (size_t) bytes);
            dst += bytes;
        } else {
            dst += Tcl_UniCharToUtf(lowChar, dst);
        }
        src += bytes;
    }
    *dst = '\0';
    return (int) (dst - str);
}
/*
*----------------------------------------------------------------------
*
* TclpUtfNcmp2 --
*
* Compare at most n bytes of utf-8 strings cs and ct. Both cs
* and ct are assumed to be at least n bytes long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
TclpUtfNcmp2(cs, ct, n)
    CONST char *cs;             /* UTF string to compare to ct. */
    CONST char *ct;             /* UTF string cs is compared to. */
    unsigned long n;            /* Number of *bytes* to compare. */
{
    /*
     * A plain memcmp() is not enough here: Tcl stores the null character
     * as the two bytes \xC0\x80, which must compare as 0. Apart from that,
     * UTF-8 orders correctly under byte-wise comparison.
     */

    register int result = 0;

    /*
     * Advance to the first differing byte, if any. On a mismatch n is left
     * non-zero and cs/ct point at the differing bytes.
     */

    while (n != 0) {
        if (*cs != *ct) {
            result = UCHAR(*cs) - UCHAR(*ct);
            break;
        }
        n--;
        cs++;
        ct++;
    }

    /*
     * If either side stopped on a 0xC0 byte, redo the comparison with a
     * \xC0\x80 pair on that side treated as the value 0.
     */

    if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
        unsigned char c1, c2;

        c1 = UCHAR(*cs);
        if ((c1 == 0xC0) && (UCHAR(cs[1]) == 0x80)) {
            c1 = 0;
        }
        c2 = UCHAR(*ct);
        if ((c2 == 0xC0) && (UCHAR(ct[1]) == 0x80)) {
            c2 = 0;
        }
        result = (c1 - c2);
    }
    return result;
}
/*
*----------------------------------------------------------------------
*
* Tcl_UtfNcmp --
*
* Compare at most n UTF chars of string cs to string ct. Both cs
* and ct are assumed to be at least n UTF chars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UtfNcmp(cs, ct, n)
    CONST char *cs;             /* UTF string to compare to ct. */
    CONST char *ct;             /* UTF string cs is compared to. */
    unsigned long n;            /* Number of UTF chars to compare. */
{
    Tcl_UniChar left, right;

    /*
     * Compare decoded characters rather than raw bytes: the byte form of
     * \u0000 (0xc0,0x80) would sort after the byte form of \u0001 (0x01)
     * under memcmp().
     *
     * n counts characters, not bytes. Both strings are expected to hold at
     * least n characters, so no check for '\0' is made.
     */

    for ( ; n > 0; n--) {
        cs += TclUtfToUniChar(cs, &left);
        ct += TclUtfToUniChar(ct, &right);
        if (left != right) {
            return (left - right);
        }
    }
    return 0;
}
/*
*----------------------------------------------------------------------
*
* Tcl_UtfNcasecmp --
*
* Compare at most n UTF chars of string cs to string ct case
* insensitive. Both cs and ct are assumed to be at least n
* UTF chars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UtfNcasecmp(cs, ct, n)
    CONST char *cs;             /* UTF string to compare to ct. */
    CONST char *ct;             /* UTF string cs is compared to. */
    unsigned long n;            /* Number of UTF chars to compare. */
{
    Tcl_UniChar left, right;

    /*
     * n counts characters, not bytes. Both strings are expected to hold at
     * least n characters, so no check for '\0' is made.
     */

    for ( ; n > 0; n--) {
        cs += TclUtfToUniChar(cs, &left);
        ct += TclUtfToUniChar(ct, &right);
        if (left == right) {
            continue;
        }

        /*
         * Only pay for case folding when the raw characters differ.
         */

        left = Tcl_UniCharToLower(left);
        right = Tcl_UniCharToLower(right);
        if (left != right) {
            return (left - right);
        }
    }
    return 0;
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharToUpper --
*
* Compute the uppercase equivalent of the given Unicode character.
*
* Results:
* Returns the uppercase Unicode character.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Tcl_UniChar
Tcl_UniCharToUpper(ch)
    int ch;                     /* Unicode character to convert. */
{
    int info = GetUniCharInfo(ch);

    /*
     * Characters without the 0x04 case-type bit are returned unchanged.
     */

    if (!(GetCaseType(info) & 0x04)) {
        return ch;
    }

    /*
     * Otherwise the result is found by subtracting the table's delta.
     */

    return (Tcl_UniChar) (ch - GetDelta(info));
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharToLower --
*
* Compute the lowercase equivalent of the given Unicode character.
*
* Results:
* Returns the lowercase Unicode character.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Tcl_UniChar
Tcl_UniCharToLower(ch)
    int ch;                     /* Unicode character to convert. */
{
    int info = GetUniCharInfo(ch);

    /*
     * Characters without the 0x02 case-type bit are returned unchanged.
     */

    if (!(GetCaseType(info) & 0x02)) {
        return ch;
    }

    /*
     * Otherwise the result is found by adding the table's delta.
     */

    return (Tcl_UniChar) (ch + GetDelta(info));
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharToTitle --
*
* Compute the titlecase equivalent of the given Unicode character.
*
* Results:
* Returns the titlecase Unicode character.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Tcl_UniChar
Tcl_UniCharToTitle(ch)
int ch; /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
int mode = GetCaseType(info);
if (mode & 0x1) {
/*
* Subtract or add one depending on the original case.
*/
return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
} else if (mode == 0x4) {
return (Tcl_UniChar) (ch - GetDelta(info));
} else {
return ch;
}
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharLen --
*
* Find the length of a UniChar string. The str input must be null
* terminated.
*
* Results:
* Returns the length of str in UniChars (not bytes).
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharLen(str)
CONST Tcl_UniChar *str; /* Unicode string to find length of. */
{
int len = 0;
while (*str != '\0') {
len++;
str++;
}
return len;
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharNcmp --
*
* Compare at most n unichars of string cs to string ct. Both cs
* and ct are assumed to be at least n unichars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharNcmp(cs, ct, n)
    CONST Tcl_UniChar *cs;      /* Unicode string to compare to ct. */
    CONST Tcl_UniChar *ct;      /* Unicode string cs is compared to. */
    unsigned long n;            /* Number of unichars to compare. */
{
#ifdef WORDS_BIGENDIAN
    /*
     * On a big-endian machine the byte-wise ordering of memcmp() agrees
     * with the ordering of the characters, so it can be used directly.
     */

    return memcmp(cs, ct, n*sizeof(Tcl_UniChar));

#else /* !WORDS_BIGENDIAN */
    /*
     * memcmp() would not give a lexically correct answer here, so compare
     * one character at a time.
     */

    while (n != 0) {
        if (*cs != *ct) {
            return (*cs - *ct);
        }
        cs++;
        ct++;
        n--;
    }
    return 0;
#endif /* WORDS_BIGENDIAN */
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharNcasecmp --
*
* Compare at most n unichars of string cs to string ct case
* insensitive. Both cs and ct are assumed to be at least n
* unichars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharNcasecmp(cs, ct, n)
    CONST Tcl_UniChar *cs;      /* Unicode string to compare to ct. */
    CONST Tcl_UniChar *ct;      /* Unicode string cs is compared to. */
    unsigned long n;            /* Number of unichars to compare. */
{
    while (n != 0) {
        /*
         * Only pay for case folding when the raw characters differ.
         */

        if (*cs != *ct) {
            Tcl_UniChar lcs = Tcl_UniCharToLower(*cs);
            Tcl_UniChar lct = Tcl_UniCharToLower(*ct);

            if (lcs != lct) {
                return (lcs - lct);
            }
        }
        n--;
        cs++;
        ct++;
    }
    return 0;
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharIsAlnum --
*
* Test if a character is an alphanumeric Unicode character.
*
* Results:
* Returns 1 if character is alphanumeric.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharIsAlnum(ch)
    int ch;                     /* Unicode character to test. */
{
    int wanted = (ALPHA_BITS | DIGIT_BITS);

    /*
     * Shift the bit for this character's category down to bit 0.
     */

    return ((wanted >> (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)) & 1);
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharIsAlpha --
*
* Test if a character is an alphabetic Unicode character.
*
* Results:
* Returns 1 if character is alphabetic.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharIsAlpha(ch)
    int ch;                     /* Unicode character to test. */
{
    int wanted = ALPHA_BITS;

    /*
     * Shift the bit for this character's category down to bit 0.
     */

    return ((wanted >> (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)) & 1);
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharIsControl --
*
* Test if a character is a Unicode control character.
*
* Results:
* Returns non-zero if character is a control.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharIsControl(ch)
    int ch;                     /* Unicode character to test. */
{
    int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

    return (category == CONTROL);
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharIsDigit --
*
* Test if a character is a numeric Unicode character.
*
* Results:
* Returns non-zero if character is a digit.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharIsDigit(ch)
    int ch;                     /* Unicode character to test. */
{
    int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

    /*
     * Only the decimal-digit category qualifies.
     */

    return (category == DECIMAL_DIGIT_NUMBER);
}
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharIsGraph --
*
* Test if a character is any Unicode print character except space.
*
* Results:
* Returns non-zero if character is printable, but not space.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int
Tcl_UniCharIsGraph(ch)
    int ch;                     /* Unicode character to test. */
{
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

    /*
     * Printable and not the space character. The comparison uses the whole
     * character value: narrowing ch to unsigned char first would make every
     * character whose low byte is 0x20 (for example U+0120) look like a
     * space and be wrongly rejected.
     */

    return (((PRINT_BITS >> category) & 1) && (ch != ' '));
}
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsLower --
 *
 *	Report whether a Unicode character is in the LOWERCASE_LETTER
 *	category.
 *
 * Results:
 *	Returns non-zero if the character is lowercase, 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
Tcl_UniCharIsLower(ch)
    int ch;			/* Unicode character to test. */
{
    int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;

    return (category == LOWERCASE_LETTER);
}
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPrint --
 *
 *	Report whether a Unicode character belongs to one of the
 *	categories collected in PRINT_BITS.
 *
 * Results:
 *	Returns non-zero if the character is printable, 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
Tcl_UniCharIsPrint(ch)
    int ch;			/* Unicode character to test. */
{
    int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
    int printBit = (PRINT_BITS >> category) & 1;

    return printBit;
}
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPunct --
 *
 *	Report whether a Unicode character belongs to one of the
 *	punctuation categories collected in PUNCT_BITS.
 *
 * Results:
 *	Returns non-zero if the character is punctuation, 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
Tcl_UniCharIsPunct(ch)
    int ch;			/* Unicode character to test. */
{
    int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
    int punctBit = (PUNCT_BITS >> category) & 1;

    return punctBit;
}
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsSpace --
 *
 *	Report whether a Unicode character is whitespace.
 *
 * Results:
 *	Returns non-zero if the character is a space, 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
Tcl_UniCharIsSpace(ch)
    int ch;			/* Unicode character to test. */
{
    /*
     * Values of 0x80 and above are classified through the Unicode
     * table, using the separator categories in SPACE_BITS.  Everything
     * below 0x80 is handed to the standard C isspace().
     */

    if (ch >= 0x80) {
	int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;

	return ((SPACE_BITS >> category) & 1);
    }
    return isspace(UCHAR(ch)); /* INTL: ISO space */
}
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsUpper --
 *
 *	Report whether a Unicode character is in the UPPERCASE_LETTER
 *	category.
 *
 * Results:
 *	Returns non-zero if the character is an uppercase letter,
 *	0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
Tcl_UniCharIsUpper(ch)
    int ch;			/* Unicode character to test. */
{
    int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;

    return (category == UPPERCASE_LETTER);
}
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsWordChar --
 *
 *	Report whether a Unicode character is alphabetic, a decimal
 *	digit, or a connector punctuation mark.
 *
 * Results:
 *	Returns 1 if the character is a word character, 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
Tcl_UniCharIsWordChar(ch)
    int ch;			/* Unicode character to test. */
{
    int category = GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK;
    int wordBit = ((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1;

    return wordBit;
}
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharCaseMatch --
 *
 *	Glob-style match of a Unicode string against a pattern, with
 *	optional case insensitivity.  This is the Unicode equivalent of
 *	the char* Tcl_StringCaseMatch.  Both arguments must be
 *	NULL-terminated; there is no provision for counted strings, so
 *	this routine should not be used where embedded NULLs may occur.
 *	Use TclUniCharMatch where possible.
 *
 * Results:
 *	Returns 1 if string matches pattern and 0 otherwise.  The
 *	pattern may contain the special characters *?\[] (see the
 *	manual entry for details on what these mean).
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
Tcl_UniCharCaseMatch(string, pattern, nocase)
    CONST Tcl_UniChar *string;	/* Unicode String. */
    CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
				 * characters. */
    int nocase;			/* 0 for case sensitive, 1 for insensitive */
{
    Tcl_UniChar subject, patCh;

    for (;;) {
	patCh = *pattern;

	/*
	 * An exhausted pattern matches only an exhausted string.  An
	 * exhausted string can still be matched, but only by a "*".
	 */

	if (patCh == 0) {
	    return (*string == 0);
	}
	if ((patCh != '*') && (*string == 0)) {
	    return 0;
	}

	/*
	 * "*" matches any substring.  Collapse a run of "*"s, then try
	 * the remainder of the pattern against every suffix of the
	 * string, recursing for each candidate, until one matches or the
	 * string runs out.
	 */

	if (patCh == '*') {
	    int literal;

	    do {
		pattern++;
	    } while (*pattern == '*');

	    patCh = *pattern;
	    if (patCh == 0) {
		return 1;
	    }
	    if (nocase) {
		patCh = Tcl_UniCharToLower(patCh);
	    }

	    /*
	     * When the next pattern character is an ordinary one we can
	     * skip quickly to its next occurrence in the string instead
	     * of recursing at every position.
	     */

	    literal = !((patCh == '[') || (patCh == '?') || (patCh == '\\'));

	    for (;; string++) {
		if (literal) {
		    while (*string != 0) {
			if (patCh == *string) {
			    break;
			}
			if (nocase
				&& (patCh == Tcl_UniCharToLower(*string))) {
			    break;
			}
			string++;
		    }
		}
		if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
		    return 1;
		}
		if (*string == 0) {
		    return 0;
		}
	    }
	}

	/*
	 * "?" matches any single character.
	 */

	if (patCh == '?') {
	    string++;
	    pattern++;
	    continue;
	}

	/*
	 * "[" introduces a set: a list of acceptable characters and/or
	 * ranges written as two characters separated by "-".
	 */

	if (patCh == '[') {
	    Tcl_UniChar rangeFirst, rangeLast;

	    pattern++;
	    subject = *string;
	    string++;
	    if (nocase) {
		subject = Tcl_UniCharToLower(subject);
	    }

	    for (;;) {
		if ((*pattern == 0) || (*pattern == ']')) {
		    /*
		     * Set exhausted without accepting the character.
		     */

		    return 0;
		}
		rangeFirst = *pattern;
		pattern++;
		if (nocase) {
		    rangeFirst = Tcl_UniCharToLower(rangeFirst);
		}

		if (*pattern != '-') {
		    if (rangeFirst == subject) {
			break;
		    }
		    continue;
		}

		pattern++;
		if (*pattern == 0) {
		    return 0;
		}
		rangeLast = *pattern;
		pattern++;
		if (nocase) {
		    rangeLast = Tcl_UniCharToLower(rangeLast);
		}

		/*
		 * Ranges match in either direction: [a-z] or [z-a].
		 */

		if (((rangeFirst <= subject) && (subject <= rangeLast))
			|| ((rangeLast <= subject) && (subject <= rangeFirst))) {
		    break;
		}
	    }

	    /*
	     * The character was accepted; skip the rest of the set.  If
	     * the set is unterminated, stop on the terminating NULL.
	     */

	    while ((*pattern != ']') && (*pattern != 0)) {
		pattern++;
	    }
	    if (*pattern == ']') {
		pattern++;
	    }
	    continue;
	}

	/*
	 * "\" quotes the following pattern character so that it is
	 * matched exactly; a trailing "\" can never match.
	 */

	if (patCh == '\\') {
	    pattern++;
	    if (*pattern == '\0') {
		return 0;
	    }
	}

	/*
	 * Ordinary character: the current characters of string and
	 * pattern must agree (after case folding when nocase is set).
	 */

	subject = *string;
	patCh = *pattern;
	if (nocase) {
	    subject = Tcl_UniCharToLower(subject);
	    patCh = Tcl_UniCharToLower(patCh);
	}
	if (subject != patCh) {
	    return 0;
	}
	string++;
	pattern++;
    }
}
/*
 *----------------------------------------------------------------------
 *
 * TclUniCharMatch --
 *
 *	See if a particular Unicode string matches a particular pattern.
 *	Allows case insensitivity.  This is the Unicode equivalent of the
 *	char* Tcl_StringCaseMatch.  This variant of Tcl_UniCharCaseMatch
 *	uses counted strings, so embedded NULLs are allowed.  Neither
 *	string nor pattern is read at or beyond its stated length, so
 *	neither needs to be NULL-terminated.
 *
 * Results:
 *	The return value is 1 if string matches pattern, and
 *	0 otherwise.  The matching operation permits the following
 *	special characters in the pattern: *?\[] (see the manual
 *	entry for details on what these mean).
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */
int
TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
    CONST Tcl_UniChar *string;	/* Unicode String. */
    int strLen;			/* length of String */
    CONST Tcl_UniChar *pattern;	/* Pattern, which may contain special
				 * characters. */
    int ptnLen;			/* length of Pattern */
    int nocase;			/* 0 for case sensitive, 1 for insensitive */
{
    CONST Tcl_UniChar *stringEnd, *patternEnd;
    Tcl_UniChar p;

    stringEnd = string + strLen;
    patternEnd = pattern + ptnLen;

    while (1) {
	/*
	 * See if we're at the end of both the pattern and the string.  If
	 * so, we succeeded.  If we're at the end of the pattern but not at
	 * the end of the string, we failed.
	 */

	if (pattern == patternEnd) {
	    return (string == stringEnd);
	}
	p = *pattern;
	if ((string == stringEnd) && (p != '*')) {
	    return 0;
	}

	/*
	 * Check for a "*" as the next pattern character.  It matches any
	 * substring.  We handle this by skipping all the characters up to
	 * the next matching one in the pattern, and then calling ourselves
	 * recursively for each postfix of string, until either we match or
	 * we reach the end of the string.
	 */

	if (p == '*') {
	    /*
	     * Skip all successive *'s in the pattern.  The bound is tested
	     * before the dereference so that a pattern ending in "*" is
	     * never read past its last element.
	     */

	    while ((++pattern < patternEnd) && (*pattern == '*')) {
		/* empty body */
	    }
	    if (pattern == patternEnd) {
		return 1;
	    }
	    p = *pattern;
	    if (nocase) {
		p = Tcl_UniCharToLower(p);
	    }
	    while (1) {
		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
		 * character
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {
		    if (nocase) {
			while ((string < stringEnd) && (p != *string)
				&& (p != Tcl_UniCharToLower(*string))) {
			    string++;
			}
		    } else {
			while ((string < stringEnd) && (p != *string)) {
			    string++;
			}
		    }
		}
		if (TclUniCharMatch(string, stringEnd - string,
			pattern, patternEnd - pattern, nocase)) {
		    return 1;
		}
		if (string == stringEnd) {
		    return 0;
		}
		string++;
	    }
	}

	/*
	 * Check for a "?" as the next pattern character.  It matches
	 * any single character.
	 */

	if (p == '?') {
	    pattern++;
	    string++;
	    continue;
	}

	/*
	 * Check for a "[" as the next pattern character.  It is followed
	 * by a list of characters that are acceptable, or by a range
	 * (two characters separated by "-").
	 */

	if (p == '[') {
	    Tcl_UniChar ch1, startChar, endChar;

	    pattern++;
	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
	    string++;
	    while (1) {
		/*
		 * End of pattern or end of set without accepting ch1.
		 * The end test comes first so *pattern is only read
		 * while it is inside the pattern.
		 */

		if ((pattern == patternEnd) || (*pattern == ']')) {
		    return 0;
		}
		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
		pattern++;
		if ((pattern < patternEnd) && (*pattern == '-')) {
		    pattern++;
		    if (pattern == patternEnd) {
			return 0;
		    }
		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
			    : *pattern);
		    pattern++;
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*
			 * Matches ranges of form [a-z] or [z-a].
			 */

			break;
		    }
		} else if (startChar == ch1) {
		    break;
		}
	    }

	    /*
	     * ch1 was accepted: skip the remainder of the set, including
	     * the closing "]" when there is one.  An unterminated set
	     * leaves pattern at patternEnd.
	     */

	    while ((pattern < patternEnd) && (*pattern != ']')) {
		pattern++;
	    }
	    if (pattern < patternEnd) {
		pattern++;
	    }
	    continue;
	}

	/*
	 * If the next pattern character is '\', just strip off the '\'
	 * so we do exact matching on the character that follows.
	 */

	if (p == '\\') {
	    if (++pattern == patternEnd) {
		return 0;
	    }
	}

	/*
	 * There's no special character.  Just make sure that the next
	 * characters of each string match.
	 */

	if (nocase) {
	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
		return 0;
	    }
	} else if (*string != *pattern) {
	    return 0;
	}
	string++;
	pattern++;
    }
}