The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/*
 * southeastasian.c - interfaces for South East Asian complex breaking.
 * 
 * Copyright (C) 2009-2012 by Hatuka*nezumi - IKEDA Soji.
 *
 * This file is part of the Sombok Package.  This program is free
 * software; you can redistribute it and/or modify it under the terms of
 * either the GNU General Public License or the Artistic License, as
 * specified in the README file.
 *
 */

#include <assert.h>
#include "sombok_constants.h"
#include "sombok.h"
#ifdef USE_LIBTHAI
#    include "thai/thwchar.h"
#    include "thai/thwbrk.h"
#endif /* USE_LIBTHAI */

/** @defgroup linebreak_southeastasian southeastasian
 * @brief Supports for breaking in South East Asian complex context.
 *
 *@{*/

/** Flag to determin whether South East Asian word segmentation is supported.
 */
const char *linebreak_southeastasian_supported =
#ifdef USE_LIBTHAI
    "Thai:" USE_LIBTHAI " "
#else /* USE_LIBTHAI */
    NULL
#endif /* USE_LIBTHAI */
    ;

void linebreak_southeastasian_flagbreak(gcstring_t * gcstr)
{
#ifdef USE_LIBTHAI
    wchar_t *buf;
    size_t i, j, len;
    int brk, sa;

    if (gcstr == NULL || gcstr->gclen == 0)
	return;
    if (!(((linebreak_t *) gcstr->lbobj)->options &
	  LINEBREAK_OPTION_COMPLEX_BREAKING))
	return;

    len = gcstr->len;

    /* Copy string to temp buffer so that abuse of external module avoided. */
    if ((buf = malloc(sizeof(wchar_t) * (len + 1))) == NULL)
	return;
#ifdef SOMBOK_UNICHAR_T_IS_WCHAR_T
    memcpy(buf, gcstr->str, sizeof(wchar_t) * len);
#else /* SOMBOK_UNICHAR_T_IS_WCHAR_T */
    for (i = 0; i < len; i++)
	buf[i] = (wchar_t) (gcstr->str[i]);
#endif /* SOMBOK_UNICHAR_T_IS_WCHAR_T */
    buf[len] = (wchar_t) 0;

    /*
     * Flag breaking points.
     * Note: th_wbrk() sometimes returns -1 when breaking positions weren't
     * found.
     */
    sa = 0;
    for (i = 0, j = 0; j < len && th_wbrk(buf + j, &brk, 1) == 1; j += brk) {
	/* check if external module is broken. */
	assert(0 <= brk);
	assert(brk < len);

	if (brk == 0) /* This should not cause but is caused by older libthai */
	    break;
	for (; i < gcstr->gclen && gcstr->gcstr[i].idx <= j + brk; i++) {
	    /* check if external module broke temp buffer. */
	    assert(buf[i] == (wchar_t) (gcstr->str[i]));

	    if (gcstr->gcstr[i].lbc == LB_SA) {
		if (!sa)
		    /* skip the first grapheme of each SA block. */
		    sa = 1;
		else if (gcstr->gcstr[i].flag)
		    /* already flagged by _prep(). */
		    ;
		else if (gcstr->gcstr[i].idx != j + brk)
		    /* not grapheme cluster boundary. */
		    ;
		else {
		    propval_t p = PROP_UNKNOWN;

		    linebreak_charprop(gcstr->lbobj,
			gcstr->str[gcstr->gcstr[i].idx - 1],
			&p, NULL, NULL, NULL);
		    /* bogus breaking by libthai on non-SA grapheme extender
		     * (e.g. CM SA). */
		    if (p == LB_SA)
			gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
		}
	    } else
		sa = 0;
	}
    }

    free(buf);
#endif /* USE_LIBTHAI */
}