The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#include "charset.h"

/*
 * Widen single-byte text to UCS-2 little-endian (low byte first, high byte 0).
 *
 * dst      output buffer, or NULL to only ask for the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes
 * src_len  number of input bytes
 *
 * Returns the number of bytes written, not counting the single terminating
 * NUL byte. Returns src_len * 2 when dst is NULL, and 0 when src is NULL or
 * dst_len is 0. Output is truncated to whole 2-byte units that fit.
 */
size_t ansi_to_ucs2le(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	byte *sp, *se, *dp, *de;
	if( src == NULL )
		return 0;
	if( dst == NULL )
		return src_len * 2;
	/* no room even for the terminator; also avoids wrapping dst_len - 1 */
	if( dst_len == 0 )
		return 0;
	sp = src;
	se = src + src_len;
	dp = dst;
	/* keep the last byte back for the terminator */
	de = dst + (dst_len - 1);
	/* only emit a unit while both of its bytes still fit */
	for( ; sp < se && de - dp >= 2; sp ++ ) {
		*dp ++ = (byte) *sp;
		*dp ++ = '\0';
	}
	*dp = '\0';
	return (size_t) (dp - dst);
}

/*
 * Widen single-byte text to UCS-2 big-endian (high byte 0 first, then the
 * input byte).
 *
 * dst      output buffer, or NULL to only ask for the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes
 * src_len  number of input bytes
 *
 * Returns the number of bytes written, not counting the single terminating
 * NUL byte. Returns src_len * 2 when dst is NULL, and 0 when src is NULL or
 * dst_len is 0. Output is truncated to whole 2-byte units that fit.
 */
size_t ansi_to_ucs2be(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	byte *sp, *se, *dp, *de;
	if( src == NULL )
		return 0;
	if( dst == NULL )
		return src_len * 2;
	/* no room even for the terminator; also avoids wrapping dst_len - 1 */
	if( dst_len == 0 )
		return 0;
	sp = src;
	se = src + src_len;
	dp = dst;
	/* keep the last byte back for the terminator */
	de = dst + (dst_len - 1);
	/* only emit a unit while both of its bytes still fit */
	for( ; sp < se && de - dp >= 2; sp ++ ) {
		*dp ++ = '\0';
		*dp ++ = (byte) *sp;
	}
	*dp = '\0';
	return (size_t) (dp - dst);
}

/*
 * Convert single-byte text to UTF-8. Bytes up to 0x7F are copied; bytes
 * above 0x7F are encoded as a two-byte sequence built from the byte value.
 *
 * dst      output buffer, or NULL to only compute the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes
 * src_len  number of input bytes
 *
 * Returns the number of bytes written (or required when dst is NULL), not
 * counting the terminating NUL. Returns 0 when src is NULL or dst_len is 0.
 * A two-byte sequence that does not fit completely is not written.
 */
INLINE size_t ansi_to_utf8(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	byte *sp, *se, *dp, *de;
	size_t count = 0;
	uint c;
	if( src == NULL )
		return 0;
	sp = src;
	se = src + src_len;
	if( dst == NULL ) {
		/* sizing pass: two bytes for every input byte above 0x7F */
		for( ; sp < se; sp ++ )
			count += (*sp > 0x7F) ? 2 : 1;
		return count;
	}
	/* no room even for the terminator */
	if( dst_len == 0 )
		return 0;
	dp = dst;
	/* keep the last byte back so the terminator always lands inside dst */
	de = dst + (dst_len - 1);
	for( ; sp < se && dp < de; sp ++ ) {
		c = *sp;
		if( c <= 0x7F ) {
			*dp ++ = (byte) c;
		}
		else {
			if( de - dp < 2 )
				break;
			*dp ++ = (byte) (0xC0 | (c >> 6));
			*dp ++ = (byte) (0x80 | (c & 0x3F));
		}
	}
	*dp = '\0';
	return (size_t) (dp - dst);
}

/*
 * Convert UCS-2 little-endian code units to UTF-8.
 *
 * dst      output buffer, or NULL to only compute the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes; a trailing odd byte is ignored
 * src_len  number of input bytes
 *
 * Returns the number of UTF-8 bytes written (or required when dst is NULL),
 * not counting the terminating NUL. Returns 0 when src is NULL or dst_len
 * is 0. Every 16-bit unit is encoded on its own (1 to 3 bytes); surrogate
 * pairs are not combined. A sequence that does not fit is not written.
 */
size_t ucs2le_to_utf8(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, need, room;
	uint c;
	if( src == NULL )
		return 0;
	if( dst == NULL ) {
		/* sizing pass; "i + 1 < src_len" cannot wrap for src_len == 0 */
		for( ; i + 1 < src_len; i += 2 ) {
			c = ((uint) src[i + 1] << 8) | src[i];
			count += (c <= 0x7F) ? 1 : (c <= 0x7FF) ? 2 : 3;
		}
		return count;
	}
	/* no room even for the terminator */
	if( dst_len == 0 )
		return 0;
	/* keep one byte back for the terminator */
	room = dst_len - 1;
	for( ; i + 1 < src_len && count < room; i += 2 ) {
		c = ((uint) src[i + 1] << 8) | src[i];
		need = (c <= 0x7F) ? 1 : (c <= 0x7FF) ? 2 : 3;
		/* stop rather than emit a truncated multi-byte sequence */
		if( need > room - count )
			break;
		if( need == 1 ) {
			dst[count ++] = (byte) c;
		}
		else if( need == 2 ) {
			dst[count ++] = (byte) (0xC0 | (c >> 6));
			dst[count ++] = (byte) (0x80 | (c & 0x3F));
		}
		else {
			dst[count ++] = (byte) (0xE0 | (c >> 12));
			dst[count ++] = (byte) (0x80 | ((c >> 6) & 0x3F));
			dst[count ++] = (byte) (0x80 | (c & 0x3F));
		}
	}
	dst[count] = '\0';
	return count;
}

/*
 * Swap the two bytes of every UCS-2 unit (little-endian in, big-endian out).
 *
 * dst      output buffer, or NULL to only ask for the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes; a trailing odd byte is ignored
 * src_len  number of input bytes
 *
 * Returns the number of bytes written, not counting the single terminating
 * NUL byte. When dst is NULL the result is src_len rounded down to an even
 * number. Returns 0 when src is NULL or dst_len is 0. Only whole units are
 * written, so the result is always even.
 */
size_t ucs2le_to_ucs2be(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, room;
	if( src == NULL )
		return 0;
	if( dst == NULL )
		return src_len - (src_len % 2);
	/* no room even for the terminator; also avoids wrapping dst_len - 1 */
	if( dst_len == 0 )
		return 0;
	/* keep one byte back for the terminator */
	room = dst_len - 1;
	/* "i + 1 < src_len" cannot wrap for src_len == 0 */
	while( i + 1 < src_len && room - count >= 2 ) {
		dst[count ++] = src[i + 1];
		dst[count ++] = src[i];
		i += 2;
	}
	dst[count] = '\0';
	return count;
}

/*
 * Narrow UCS-2 little-endian text to single bytes by keeping only the low
 * byte of every unit; the high byte is discarded without any check.
 *
 * dst      output buffer, or NULL to only ask for the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes; a trailing odd byte is ignored
 * src_len  number of input bytes
 *
 * Returns the number of bytes written, not counting the terminating NUL.
 * When dst is NULL the result is the number of complete units in src.
 * Returns 0 when src is NULL or dst_len is 0.
 */
size_t ucs2le_to_ansi(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, room;
	if( src == NULL )
		return 0;
	if( dst == NULL )
		return src_len / 2;
	/* no room even for the terminator */
	if( dst_len == 0 )
		return 0;
	/* keep one byte back for the terminator */
	room = dst_len - 1;
	/* "i + 1 < src_len" cannot wrap for src_len == 0 */
	while( i + 1 < src_len && count < room ) {
		dst[count ++] = src[i];
		i += 2;
	}
	dst[count] = '\0';
	return count;
}

/*
 * Convert UCS-2 big-endian code units to UTF-8.
 *
 * dst      output buffer, or NULL to only compute the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes; a trailing odd byte is ignored
 * src_len  number of input bytes
 *
 * Returns the number of UTF-8 bytes written (or required when dst is NULL),
 * not counting the terminating NUL. Returns 0 when src is NULL or dst_len
 * is 0. Every 16-bit unit is encoded on its own (1 to 3 bytes); surrogate
 * pairs are not combined. A sequence that does not fit is not written.
 */
size_t ucs2be_to_utf8(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, need, room;
	uint c;
	if( src == NULL )
		return 0;
	if( dst == NULL ) {
		/* sizing pass; "i + 1 < src_len" cannot wrap for src_len == 0 */
		for( ; i + 1 < src_len; i += 2 ) {
			c = ((uint) src[i] << 8) | src[i + 1];
			count += (c <= 0x7F) ? 1 : (c <= 0x7FF) ? 2 : 3;
		}
		return count;
	}
	/* no room even for the terminator */
	if( dst_len == 0 )
		return 0;
	/* keep one byte back for the terminator */
	room = dst_len - 1;
	for( ; i + 1 < src_len && count < room; i += 2 ) {
		c = ((uint) src[i] << 8) | src[i + 1];
		need = (c <= 0x7F) ? 1 : (c <= 0x7FF) ? 2 : 3;
		/* stop rather than emit a truncated multi-byte sequence */
		if( need > room - count )
			break;
		if( need == 1 ) {
			dst[count ++] = (byte) c;
		}
		else if( need == 2 ) {
			dst[count ++] = (byte) (0xC0 | (c >> 6));
			dst[count ++] = (byte) (0x80 | (c & 0x3F));
		}
		else {
			dst[count ++] = (byte) (0xE0 | (c >> 12));
			dst[count ++] = (byte) (0x80 | ((c >> 6) & 0x3F));
			dst[count ++] = (byte) (0x80 | (c & 0x3F));
		}
	}
	dst[count] = '\0';
	return count;
}

/*
 * Swap the two bytes of every UCS-2 unit (big-endian in, little-endian out).
 *
 * dst      output buffer, or NULL to only ask for the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes; a trailing odd byte is ignored
 * src_len  number of input bytes
 *
 * Returns the number of bytes written, not counting the single terminating
 * NUL byte. When dst is NULL the result is src_len rounded down to an even
 * number. Returns 0 when src is NULL or dst_len is 0. Only whole units are
 * written, so the result is always even.
 */
size_t ucs2be_to_ucs2le(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, room;
	if( src == NULL )
		return 0;
	if( dst == NULL )
		return src_len - (src_len % 2);
	/* no room even for the terminator; also avoids wrapping dst_len - 1 */
	if( dst_len == 0 )
		return 0;
	/* keep one byte back for the terminator */
	room = dst_len - 1;
	/* "i + 1 < src_len" cannot wrap for src_len == 0 */
	while( i + 1 < src_len && room - count >= 2 ) {
		dst[count ++] = src[i + 1];
		dst[count ++] = src[i];
		i += 2;
	}
	dst[count] = '\0';
	return count;
}

/*
 * Narrow UCS-2 big-endian text to single bytes by keeping only the low
 * (second) byte of every unit; the high byte is discarded without any check.
 *
 * dst      output buffer, or NULL to only ask for the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      input bytes; a trailing odd byte is ignored
 * src_len  number of input bytes
 *
 * Returns the number of bytes written, not counting the terminating NUL.
 * When dst is NULL the result is the number of complete units in src.
 * Returns 0 when src is NULL or dst_len is 0.
 */
size_t ucs2be_to_ansi(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, room;
	if( src == NULL )
		return 0;
	if( dst == NULL )
		return src_len / 2;
	/* no room even for the terminator */
	if( dst_len == 0 )
		return 0;
	/* keep one byte back for the terminator */
	room = dst_len - 1;
	/* "i + 1 < src_len" cannot wrap for src_len == 0 */
	while( i + 1 < src_len && count < room ) {
		dst[count ++] = src[i + 1];
		i += 2;
	}
	dst[count] = '\0';
	return count;
}

/*
 * Convert UTF-8 to single-byte text. Each decoded character is stored as
 * its low 8 bits; higher bits are dropped without any check.
 *
 * dst      output buffer, or NULL to only count the characters
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      UTF-8 input
 * src_len  number of input bytes
 *
 * Returns the number of characters written (or counted when dst is NULL),
 * not counting the terminating NUL. Returns 0 when src is NULL or dst_len
 * is 0, and CS_CNV_ERROR when a sequence is cut short or a trailing byte is
 * not of the form 10xxxxxx; dst is left unterminated in that case.
 *
 * Decoding detail: any byte with the high bit set starts a sequence, and
 * bit 5 of that byte alone selects two trailing bytes instead of one, so
 * sequences longer than three bytes are not recognised as such.
 */
size_t utf8_to_ansi(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, room;
	uint c, wc;
	if( src == NULL )
		return 0;
	if( dst != NULL ) {
		/* no room even for the terminator; avoids indexing dst[dst_len - 1] */
		if( dst_len == 0 )
			return 0;
		/* keep one byte back for the terminator */
		room = dst_len - 1;
		while( i < src_len && count < room ) {
			wc = src[i ++];
			if( wc & 0x80 ) {
				if( i >= src_len )
					return CS_CNV_ERROR;
				wc &= 0x3F;
				if( wc & 0x20 ) {
					/* three-byte form: fold in the middle byte first */
					c = src[i ++];
					if( (c & 0xC0) != 0x80 || i >= src_len )
						return CS_CNV_ERROR;
					wc = (wc << 6) | (c & 0x3F);
				}
				c = src[i ++];
				if( (c & 0xC0) != 0x80 )
					return CS_CNV_ERROR;
				wc = (wc << 6) | (c & 0x3F);
			}
			dst[count ++] = (byte) wc;
		}
		dst[count] = '\0';
	}
	else {
		/* counting pass: same validation, nothing is stored */
		while( i < src_len ) {
			c = src[i ++];
			if( c & 0x80 ) {
				if( i >= src_len )
					return CS_CNV_ERROR;
				c &= 0x3F;
				if( c & 0x20 ) {
					c = src[i ++];
					if( (c & 0xC0) != 0x80 || i >= src_len )
						return CS_CNV_ERROR;
				}
				c = src[i ++];
				if( (c & 0xC0) != 0x80 )
					return CS_CNV_ERROR;
			}
			count ++;
		}
	}
	return count;
}

/*
 * Convert UTF-8 to UCS-2 little-endian (low byte first).
 *
 * dst      output buffer, or NULL to only compute the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      UTF-8 input
 * src_len  number of input bytes
 *
 * Returns the number of bytes written (or required when dst is NULL), not
 * counting the single terminating NUL byte. Returns 0 when src is NULL or
 * dst_len is 0, and CS_CNV_ERROR when a sequence is cut short or a trailing
 * byte is not of the form 10xxxxxx. Only whole 2-byte units are written;
 * bits above the low 16 of a decoded value are dropped.
 *
 * Decoding detail: any byte with the high bit set starts a sequence, and
 * bit 5 of that byte alone selects two trailing bytes instead of one.
 */
size_t utf8_to_ucs2le(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, room;
	uint c, wc;
	if( src == NULL )
		return 0;
	if( dst != NULL ) {
		/* no room even for the terminator; avoids indexing dst[dst_len - 1] */
		if( dst_len == 0 )
			return 0;
		/* keep one byte back for the terminator */
		room = dst_len - 1;
		while( i < src_len && count < room ) {
			wc = src[i ++];
			if( wc & 0x80 ) {
				if( i >= src_len )
					return CS_CNV_ERROR;
				wc &= 0x3F;
				if( wc & 0x20 ) {
					/* three-byte form: fold in the middle byte first */
					c = src[i ++];
					if( (c & 0xC0) != 0x80 || i >= src_len )
						return CS_CNV_ERROR;
					wc = (wc << 6) | (c & 0x3F);
				}
				c = src[i ++];
				if( (c & 0xC0) != 0x80 )
					return CS_CNV_ERROR;
				wc = (wc << 6) | (c & 0x3F);
			}
			/* a unit is written whole or not at all */
			if( room - count < 2 )
				break;
			dst[count ++] = (byte) (wc & 0xFF);
			dst[count ++] = (byte) ((wc >> 8) & 0xFF);
		}
		dst[count] = '\0';
	}
	else {
		/* sizing pass: same validation, two output bytes per character */
		while( i < src_len ) {
			c = src[i ++];
			if( c & 0x80 ) {
				if( i >= src_len )
					return CS_CNV_ERROR;
				c &= 0x3F;
				if( c & 0x20 ) {
					c = src[i ++];
					if( (c & 0xC0) != 0x80 || i >= src_len )
						return CS_CNV_ERROR;
				}
				c = src[i ++];
				if( (c & 0xC0) != 0x80 )
					return CS_CNV_ERROR;
			}
			count += 2;
		}
	}
	return count;
}

/*
 * Convert UTF-8 to UCS-2 big-endian (high byte first).
 *
 * dst      output buffer, or NULL to only compute the required size
 * dst_len  capacity of dst in bytes, including one byte for the terminator
 * src      UTF-8 input
 * src_len  number of input bytes
 *
 * Returns the number of bytes written (or required when dst is NULL), not
 * counting the single terminating NUL byte. Returns 0 when src is NULL or
 * dst_len is 0, and CS_CNV_ERROR when a sequence is cut short or a trailing
 * byte is not of the form 10xxxxxx. Only whole 2-byte units are written;
 * bits above the low 16 of a decoded value are dropped.
 *
 * Decoding detail: any byte with the high bit set starts a sequence, and
 * bit 5 of that byte alone selects two trailing bytes instead of one.
 */
size_t utf8_to_ucs2be(
	byte *dst, size_t dst_len, byte *src, size_t src_len
) {
	size_t i = 0, count = 0, room;
	uint c, wc;
	if( src == NULL )
		return 0;
	if( dst != NULL ) {
		/* no room even for the terminator; avoids indexing dst[dst_len - 1] */
		if( dst_len == 0 )
			return 0;
		/* keep one byte back for the terminator */
		room = dst_len - 1;
		while( i < src_len && count < room ) {
			wc = src[i ++];
			if( wc & 0x80 ) {
				if( i >= src_len )
					return CS_CNV_ERROR;
				wc &= 0x3F;
				if( wc & 0x20 ) {
					/* three-byte form: fold in the middle byte first */
					c = src[i ++];
					if( (c & 0xC0) != 0x80 || i >= src_len )
						return CS_CNV_ERROR;
					wc = (wc << 6) | (c & 0x3F);
				}
				c = src[i ++];
				if( (c & 0xC0) != 0x80 )
					return CS_CNV_ERROR;
				wc = (wc << 6) | (c & 0x3F);
			}
			/* a unit is written whole or not at all */
			if( room - count < 2 )
				break;
			dst[count ++] = (byte) ((wc >> 8) & 0xFF);
			dst[count ++] = (byte) (wc & 0xFF);
		}
		dst[count] = '\0';
	}
	else {
		/* sizing pass: same validation, two output bytes per character */
		while( i < src_len ) {
			c = src[i ++];
			if( c & 0x80 ) {
				if( i >= src_len )
					return CS_CNV_ERROR;
				c &= 0x3F;
				if( c & 0x20 ) {
					c = src[i ++];
					if( (c & 0xC0) != 0x80 || i >= src_len )
						return CS_CNV_ERROR;
				}
				c = src[i ++];
				if( (c & 0xC0) != 0x80 )
					return CS_CNV_ERROR;
			}
			count += 2;
		}
	}
	return count;
}

/*
 * Walk a NUL-terminated UTF-8 string sequence by sequence and return how
 * many bytes precede the terminator.
 *
 * Note that the result is a byte count, not a character count: every byte
 * that is stepped over is counted. A byte with the high bit set is followed
 * by one trailing byte, or two when its bit 5 is set.
 *
 * The walk stops at the terminator even when the last sequence is cut
 * short, so it never reads past the end of the string.
 */
INLINE size_t utf8_strlen( const char *str ) {
	size_t count = 0;
	unsigned int lead, extra;
	while( str[count] != '\0' ) {
		lead = (unsigned char) str[count ++];
		if( lead & 0x80 ) {
			extra = (lead & 0x20) ? 2 : 1;
			/* do not step over the terminator of a truncated sequence */
			while( extra > 0 && str[count] != '\0' ) {
				count ++;
				extra --;
			}
		}
	}
	return count;
}

/*
 * Map a charset id to its display name. Ids without an entry here yield
 * "unknown". The returned pointer refers to a string literal.
 */
INLINE const char *get_charset_name( enum n_charset cs ) {
	const char *label = "unknown";
	switch( cs ) {
	case CS_UTF8:
		label = "UTF-8";
		break;
	case CS_UCS2BE:
		label = "UNICODE";
		break;
	case CS_UCS2LE:
		label = "UTF-16LE";
		break;
	case CS_ANSI:
		label = "ANSI";
		break;
	default:
		break;
	}
	return label;
}

INLINE enum n_charset get_charset_id( const char *name ) {
	if( name == NULL )
		return CS_ANSI;
	if( my_stricmp( name, "UTF8" ) == 0 ||
		my_stricmp( name, "UTF-8" ) == 0
	) {
		return CS_UTF8;
	}
	if( my_stricmp( name, "UTF16" ) == 0 ||
		my_stricmp( name, "UTF-16" ) == 0 ||
		my_stricmp( name, "UCS2" ) == 0 ||
		my_stricmp( name, "UNICODE" ) == 0
	) {
		return CS_UTF16;
	}
	if( my_stricmp( name, "UCS2BE" ) == 0 ||
		my_stricmp( name, "UTF-16BE" ) == 0 ||
		my_stricmp( name, "UTF16BE" ) == 0
	) {
		return CS_UCS2BE;
	}
	if( my_stricmp( name, "UCS2LE" ) == 0 ||
		my_stricmp( name, "UTF-16LE" ) == 0 ||
		my_stricmp( name, "UTF16LE" ) == 0
	) {
		return CS_UCS2LE;
	}
	if( my_stricmp( name, "ANSI" ) == 0 ||
		my_stricmp( name, "ASCII" ) == 0
	) {
		return CS_ANSI;
	}
	return CS_UNKNOWN;
}

/*
 * Convert src_len bytes of src from charset cs_src to charset cs_dst.
 *
 * The result is stored in *p_dst, which is resized with Renew as needed.
 * When both charsets are equal, or the pair has no converter, the input is
 * copied unchanged and terminated with a NUL byte.
 *
 * Returns the length of the result without its terminator, 0 when src_len
 * is 0 (in which case *p_dst is not touched), or CS_CNV_ERROR when the
 * selected converter reports an error.
 */
INLINE size_t charset_convert(
	char *src, size_t src_len, enum n_charset cs_src, enum n_charset cs_dst,
	char **p_dst
) {
	size_t (*convert)( byte *, size_t, byte *, size_t ) = NULL;
	size_t cap = 0, out_len;
	int dst_known = 0;
	if( src_len == 0 )
		return 0;
#ifdef CSV_DEBUG
	_debug( "charset_convert from %d to %d\n", (int) cs_src, (int) cs_dst );
#endif
	if( cs_src != cs_dst ) {
		/* pick the working buffer size and the converter for this pair */
		switch( cs_dst ) {
		case CS_UTF8:
			dst_known = 1;
			cap = src_len * 3 + 1;
			if( cs_src == CS_ANSI )
				convert = ansi_to_utf8;
			else if( cs_src == CS_UCS2LE )
				convert = ucs2le_to_utf8;
			else if( cs_src == CS_UCS2BE )
				convert = ucs2be_to_utf8;
			break;
		case CS_ANSI:
			dst_known = 1;
			cap = src_len + 1;
			if( cs_src == CS_UTF8 )
				convert = utf8_to_ansi;
			else if( cs_src == CS_UCS2LE )
				convert = ucs2le_to_ansi;
			else if( cs_src == CS_UCS2BE )
				convert = ucs2be_to_ansi;
			break;
		case CS_UCS2LE:
			dst_known = 1;
			cap = src_len * 2 + 1;
			if( cs_src == CS_ANSI )
				convert = ansi_to_ucs2le;
			else if( cs_src == CS_UTF8 )
				convert = utf8_to_ucs2le;
			else if( cs_src == CS_UCS2BE )
				convert = ucs2be_to_ucs2le;
			break;
		case CS_UCS2BE:
			dst_known = 1;
			cap = src_len * 2 + 1;
			if( cs_src == CS_ANSI )
				convert = ansi_to_ucs2be;
			else if( cs_src == CS_UTF8 )
				convert = utf8_to_ucs2be;
			else if( cs_src == CS_UCS2LE )
				convert = ucs2le_to_ucs2be;
			break;
		default:
			break;
		}
		if( dst_known ) {
			/* the buffer is grown for a known target even without a converter */
			Renew( (*p_dst), cap, char );
			if( convert != NULL ) {
				out_len = convert(
					(byte *) (*p_dst), cap, (byte *) src, src_len
				);
				if( out_len == CS_CNV_ERROR )
					return CS_CNV_ERROR;
				/* shrink to the converted size plus terminator */
				if( out_len > 0 )
					Renew( (*p_dst), out_len + 1, char );
				return out_len;
			}
		}
	}
	/* nothing to convert: hand back a terminated copy of the input */
	Renew( (*p_dst), src_len + 1, char );
	Copy( src, (*p_dst), src_len, char );
	(*p_dst)[src_len] = '\0';
	return src_len;
}

/*
 * Build a single-quoted string literal from str: the text is wrapped in
 * single quotes and every embedded single quote is doubled.
 *
 * cs       charset of str; selects the width and byte order of the quote
 * str      input text (need not be NUL-terminated)
 * str_len  length of str in bytes
 *
 * Returns a new SV holding the quoted bytes. For the two UCS-2 charsets an
 * odd str_len yields an empty string.
 *
 * UTF-8 shares the single-byte path: every byte of a multi-byte UTF-8
 * sequence has its high bit set, so a 0x27 byte is always a real quote and
 * must always be doubled, also when the input is malformed.
 */
INLINE SV *
charset_quote( enum n_charset cs, const char *str, size_t str_len ) {
	const char *sp = str, *se = str + str_len;
	char *tmp, *so;
	SV *sv;
	/* roomy upper bound: at most doubled input plus the enclosing quotes */
	Newx( tmp, str_len * 4 + 5, char );
	so = tmp;
	switch( cs ) {
	case CS_UCS2BE:
		if( (str_len % 2) != 0 )
			break;
		*so ++ = '\0';
		*so ++ = '\'';
		for( ; sp < se; sp += 2 ) {
			/* a quote unit is written twice */
			if( sp[0] == '\0' && sp[1] == '\'' ) {
				*so ++ = '\0';
				*so ++ = '\'';
			}
			*so ++ = sp[0];
			*so ++ = sp[1];
		}
		*so ++ = '\0';
		*so ++ = '\'';
		break;
	case CS_UCS2LE:
		if( (str_len % 2) != 0 )
			break;
		*so ++ = '\'';
		*so ++ = '\0';
		for( ; sp < se; sp += 2 ) {
			/* a quote unit is written twice */
			if( sp[0] == '\'' && sp[1] == '\0' ) {
				*so ++ = '\'';
				*so ++ = '\0';
			}
			*so ++ = sp[0];
			*so ++ = sp[1];
		}
		*so ++ = '\'';
		*so ++ = '\0';
		break;
	case CS_ANSI:
	case CS_UTF8:
	default:
		*so ++ = '\'';
		for( ; sp < se; sp ++ ) {
			if( *sp == '\'' )
				*so ++ = '\'';
			*so ++ = *sp;
		}
		*so ++ = '\'';
		break;
	}
	sv = newSVpvn( tmp, so - tmp );
	Safefree( tmp );
	return sv;
}

/*
 * Build a quoted, dot-separated identifier from argc name parts.
 *
 * Every non-empty part is wrapped in double quotes with embedded double
 * quotes doubled, and the parts are joined with '.'. Empty and NULL parts
 * are skipped.
 *
 * cs    charset of the parts; for CS_UCS2LE / CS_UCS2BE each part is read as
 *       2-byte units ending at a zero unit, and quotes and dots are emitted
 *       as 2-byte units in the matching byte order. Every other charset is
 *       handled bytewise up to the first NUL byte.
 * args  the name parts
 * argc  number of entries in args
 *
 * Returns a new SV with the result, or newSV( 0 ) when no part produced
 * any output.
 *
 * UTF-8 shares the bytewise path: every byte of a multi-byte UTF-8 sequence
 * has its high bit set, so a 0x22 byte is always a real quote.
 */
INLINE SV *
charset_quote_id( enum n_charset cs, const char **args, int argc ) {
	SV *sv;
	char *tmp, *so;
	const char *sp;
	char quote[2], dot[2];
	size_t w, k, n, cap = 1;
	int i;
	/* w is the unit width in bytes; quote/dot hold one unit each */
	switch( cs ) {
	case CS_UCS2LE:
		w = 2;
		quote[0] = '\"';
		quote[1] = '\0';
		dot[0] = '.';
		dot[1] = '\0';
		break;
	case CS_UCS2BE:
#ifdef CSV_DEBUG
		_debug( "quote id ucs2be\n" );
#endif
		w = 2;
		quote[0] = '\0';
		quote[1] = '\"';
		dot[0] = '\0';
		dot[1] = '.';
		break;
	case CS_ANSI:
	case CS_UTF8:
	default:
		w = 1;
		quote[0] = '\"';
		quote[1] = '\0';
		dot[0] = '.';
		dot[1] = '\0';
		break;
	}
	/*
	 * First pass: size the buffer for the worst case, where every unit is
	 * doubled and each part adds two quotes and one separator.
	 */
	for( i = 0; i < argc; i ++ ) {
		sp = args[i];
		if( sp == NULL )
			continue;
		for( n = 0; sp[n] != '\0' || (w == 2 && sp[n + 1] != '\0'); n += w )
			;
		cap += 2 * n + 3 * w;
	}
	Newx( tmp, cap, char );
	so = tmp;
	/* second pass: emit the parts */
	for( i = 0; i < argc; i ++ ) {
		sp = args[i];
		if( sp == NULL || (sp[0] == '\0' && (w == 1 || sp[1] == '\0')) )
			continue;
		/* separator goes between parts, never after the last one */
		if( so != tmp ) {
			for( k = 0; k < w; k ++ )
				*so ++ = dot[k];
		}
		for( k = 0; k < w; k ++ )
			*so ++ = quote[k];
		for( ; sp[0] != '\0' || (w == 2 && sp[1] != '\0'); sp += w ) {
			/* a quote unit is written twice */
			if( sp[0] == quote[0] && (w == 1 || sp[1] == quote[1]) ) {
				for( k = 0; k < w; k ++ )
					*so ++ = quote[k];
			}
			for( k = 0; k < w; k ++ )
				*so ++ = sp[k];
		}
		for( k = 0; k < w; k ++ )
			*so ++ = quote[k];
	}
	if( so > tmp )
		sv = newSVpvn( tmp, so - tmp );
	else
		sv = newSV( 0 );
	Safefree( tmp );
	return sv;
}