/* The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event. */
#ifdef __cplusplus
extern "C" {
#endif
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#ifdef __cplusplus
}
#endif

#include "cjk-tokenizer.h"

using namespace std;
using namespace cjk;

MODULE = Lingua::CJK::Tokenizer		PACKAGE = Lingua::CJK::Tokenizer

PROTOTYPES: ENABLE

# Constructor: allocate a fresh C++ tokenizer and hand it back to Perl.
# The CLASS argument is accepted but not referenced in this CODE section;
# how the returned pointer is wrapped depends on the typemap entry for
# "tokenizer *", which is not visible in this file.
tokenizer * new(SV* CLASS)
    CODE:
        /* Heap-allocate the object; it is released outside this XSUB
           (presumably by DESTROY - confirm against the rest of the file). */
        RETVAL = new tokenizer();
    OUTPUT:
        RETVAL

# Setter: store the given value in the ngram_size member of the wrapped
# tokenizer object.  Returns nothing.
void
tokenizer::ngram_size(U32 ngram_size)
    CODE:
        /* THIS is the C++ object behind the Perl invocant; write the new
           value straight into its member of the same name. */
        tokenizer& self = *THIS;
        self.ngram_size = ngram_size;

# Setter: store the given value in the max_token_count member of the
# wrapped tokenizer object.  Returns nothing.
void
tokenizer::max_token_count(U32 max_token_count)
    CODE:
        /* THIS is the C++ object behind the Perl invocant; write the new
           value straight into its member of the same name. */
        tokenizer& self = *THIS;
        self.max_token_count = max_token_count;

# Run tokenizer::tokenize() over a UTF-8 encoded byte string and return a
# reference to a new array holding the strings it produced, in order.
# Croaks with "The input must be a UTF-8 string" when the argument is not
# a string SV or its bytes are not well-formed UTF-8.
SV*
tokenizer::tokenize(SV* str)
    CODE:
        /* SvPV() stores the byte length into its second argument, so give
           it a real STRLEN variable instead of the SV's own SvCUR() slot.
           SvPOK() is still tested first so non-string SVs are rejected
           without being stringified. */
        STRLEN str_len = 0;
        char* str_bytes = NULL;
        if (SvPOK(str)) {
            str_bytes = SvPV(str, str_len);
        }
        /* All validation happens before any C++ object is constructed.
           croak() does not return, so nothing needs to follow it. */
        if (str_bytes == NULL
            || !is_utf8_string((U8*) str_bytes, str_len)) {
            Perl_croak(aTHX_ "The input must be a UTF-8 string");
        }

        /* Pointer + length construction keeps input that contains NUL
           bytes intact instead of cutting it at the first NUL. */
        string s(str_bytes, str_len);
        vector<string> token_list;
        THIS->tokenize(s, token_list);

        /* Copy every token into a Perl array; newSVpvn() always honours
           the explicit length, including a length of zero. */
        AV* tokens = newAV();
        for (vector<string>::const_iterator token_iter = token_list.begin();
             token_iter != token_list.end(); ++token_iter) {
            av_push(tokens, newSVpvn(token_iter->data(),
                                     (STRLEN) token_iter->length()));
        }
        /* The reference takes over the array's only reference count. */
        RETVAL = newRV_noinc((SV*) tokens);
    OUTPUT:
        RETVAL

# Run tokenizer::split() over a UTF-8 encoded byte string and return a
# reference to a new array holding the strings it produced, in order.
# Croaks with "The input must be a UTF-8 string" when the argument is not
# a string SV or its bytes are not well-formed UTF-8.
SV*
tokenizer::split(SV* str)
    CODE:
        /* SvPV() stores the byte length into its second argument, so give
           it a real STRLEN variable instead of the SV's own SvCUR() slot.
           SvPOK() is still tested first so non-string SVs are rejected
           without being stringified. */
        STRLEN str_len = 0;
        char* str_bytes = NULL;
        if (SvPOK(str)) {
            str_bytes = SvPV(str, str_len);
        }
        /* All validation happens before any C++ object is constructed.
           croak() does not return, so nothing needs to follow it. */
        if (str_bytes == NULL
            || !is_utf8_string((U8*) str_bytes, str_len)) {
            Perl_croak(aTHX_ "The input must be a UTF-8 string");
        }

        /* Pointer + length construction keeps input that contains NUL
           bytes intact instead of cutting it at the first NUL. */
        string s(str_bytes, str_len);
        vector<string> token_list;
        THIS->split(s, token_list);

        /* Copy every piece into a Perl array; newSVpvn() always honours
           the explicit length, including a length of zero. */
        AV* tokens = newAV();
        for (vector<string>::const_iterator token_iter = token_list.begin();
             token_iter != token_list.end(); ++token_iter) {
            av_push(tokens, newSVpvn(token_iter->data(),
                                     (STRLEN) token_iter->length()));
        }
        /* The reference takes over the array's only reference count. */
        RETVAL = newRV_noinc((SV*) tokens);
    OUTPUT:
        RETVAL

# Run tokenizer::segment() over a UTF-8 encoded byte string and return a
# reference to a new array holding the strings it produced, in order.
# Croaks with "The input must be a UTF-8 string" when the argument is not
# a string SV or its bytes are not well-formed UTF-8.
SV*
tokenizer::segment(SV* str)
    CODE:
        /* SvPV() stores the byte length into its second argument, so give
           it a real STRLEN variable instead of the SV's own SvCUR() slot.
           SvPOK() is still tested first so non-string SVs are rejected
           without being stringified. */
        STRLEN str_len = 0;
        char* str_bytes = NULL;
        if (SvPOK(str)) {
            str_bytes = SvPV(str, str_len);
        }
        /* All validation happens before any C++ object is constructed.
           croak() does not return, so nothing needs to follow it. */
        if (str_bytes == NULL
            || !is_utf8_string((U8*) str_bytes, str_len)) {
            Perl_croak(aTHX_ "The input must be a UTF-8 string");
        }

        /* Pointer + length construction keeps input that contains NUL
           bytes intact instead of cutting it at the first NUL. */
        string s(str_bytes, str_len);
        vector<string> token_list;
        THIS->segment(s, token_list);

        /* Copy every segment into a Perl array; newSVpvn() always honours
           the explicit length, including a length of zero. */
        AV* tokens = newAV();
        for (vector<string>::const_iterator token_iter = token_list.begin();
             token_iter != token_list.end(); ++token_iter) {
            av_push(tokens, newSVpvn(token_iter->data(),
                                     (STRLEN) token_iter->length()));
        }
        /* The reference takes over the array's only reference count. */
        RETVAL = newRV_noinc((SV*) tokens);
    OUTPUT:
        RETVAL

# Pass a UTF-8 encoded byte string to tokenizer::has_cjk() and return its
# boolean result.
# Croaks with "The input must be a UTF-8 string" when the argument is not
# a string SV or its bytes are not well-formed UTF-8.
bool
tokenizer::has_cjk(SV* str)
    CODE:
        /* SvPV() stores the byte length into its second argument, so give
           it a real STRLEN variable instead of the SV's own SvCUR() slot.
           SvPOK() is still tested first so non-string SVs are rejected
           without being stringified. */
        STRLEN str_len = 0;
        char* str_bytes = NULL;
        if (SvPOK(str)) {
            str_bytes = SvPV(str, str_len);
        }
        /* All validation happens before any C++ object is constructed.
           croak() does not return, so nothing needs to follow it. */
        if (str_bytes == NULL
            || !is_utf8_string((U8*) str_bytes, str_len)) {
            Perl_croak(aTHX_ "The input must be a UTF-8 string");
        }

        /* Pointer + length construction keeps input that contains NUL
           bytes intact instead of cutting it at the first NUL. */
        string s(str_bytes, str_len);
        RETVAL = THIS->has_cjk(s);
    OUTPUT:
        RETVAL

# Pass a UTF-8 encoded byte string to tokenizer::has_cjk_only() and return
# its boolean result.
# Croaks with "The input must be a UTF-8 string" when the argument is not
# a string SV or its bytes are not well-formed UTF-8.
bool
tokenizer::has_cjk_only(SV* str)
    CODE:
        /* SvPV() stores the byte length into its second argument, so give
           it a real STRLEN variable instead of the SV's own SvCUR() slot.
           SvPOK() is still tested first so non-string SVs are rejected
           without being stringified. */
        STRLEN str_len = 0;
        char* str_bytes = NULL;
        if (SvPOK(str)) {
            str_bytes = SvPV(str, str_len);
        }
        /* All validation happens before any C++ object is constructed.
           croak() does not return, so nothing needs to follow it. */
        if (str_bytes == NULL
            || !is_utf8_string((U8*) str_bytes, str_len)) {
            Perl_croak(aTHX_ "The input must be a UTF-8 string");
        }

        /* Pointer + length construction keeps input that contains NUL
           bytes intact instead of cutting it at the first NUL. */
        string s(str_bytes, str_len);
        RETVAL = THIS->has_cjk_only(s);

    OUTPUT:
        RETVAL

void
tokenizer::DESTROY()