cjk-tokenizer/test.cc - metacpan.org

#include <iostream>
#include "cjk-tokenizer.h"

using namespace std;
using namespace cjk;

class handler : public tokenizer_handler {
public:
    void handle_token(const std::string &tok, bool is_cjk) {
        cout << "Handling [" << tok << "] by handler class. "
             << "(CJK: " << is_cjk << ")" << endl;
    };
};

int main() {
    tokenizer tknzr;
    vector<string> token_list;
    vector<string>::iterator token_iter;
    char text[] =
        "美女遊戲等你挑戰周蕙最新鈴搶先下載茄子醬耍可愛一流"
        "华沙是波兰的首都，也是其最大的城市。"
        "납치 여중생 공포에 떠는데'…경찰 200m 거리 25분만에 출동"
        "寛永通宝の一。京都方広寺の大仏をこわして1668年（寛文8）から鋳造した銅銭。"
        "ㄅㄆㄇㄈㄉㄊㄋㄌㄧㄨㄩ"
        "Giant Microwave Turns Plastic Back to Oil";
    string text_str = string(text);

    cout << "[Default]" << endl;
    token_list.clear();
    cout << "Ngram size: "  << tknzr.ngram_size << endl;
    tknzr.tokenize(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    cout << "[Trigram]" << endl;
    token_list.clear();
    tknzr.ngram_size = 3;
    cout << "Ngram size: "  << tknzr.ngram_size << endl;
    tknzr.tokenize(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    cout << "[Pentagram]" << endl;
    token_list.clear();
    tknzr.ngram_size = 5;
    cout << "Ngram size: "  << tknzr.ngram_size << endl;
    tknzr.tokenize(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    cout << "[Max token count]" << endl;
    token_list.clear();
    tknzr.max_token_count = 10;
    cout << "Max token count: " << tknzr.max_token_count << endl;
    tknzr.tokenize(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    cout << "[Unigram]" << endl;
    token_list.clear();
    tknzr.ngram_size = 1;
    tknzr.max_token_count = 0;
    tknzr.tokenize(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    cout << "[Split]" << endl;
    token_list.clear();
    tknzr.split(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    vector<unicode_char_t> uchar_list;
    vector<unicode_char_t>::iterator uchar_iter;

    cout << "[Split (unicode_char_t)]" << endl;
    tknzr.split(text_str, uchar_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (uchar_iter = uchar_list.begin();
         uchar_iter != uchar_list.end(); uchar_iter++) {
        cout << "[" << *uchar_iter << "] ";
    }
    cout << endl << endl;

    cout << "-- CJK Segmentation" << endl;
    cout << "Original string: " << text << endl;
    token_list.clear();
    tknzr.segment(text_str, token_list);
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    cout << "[Split] (zh_tw -> zh_cn)" << endl;
    token_list.clear();
    tknzr.han_conv_method = HAN_CONV_TRAD2SIMP;
    tknzr.split(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    cout << "[Split] (zh_cn -> zh_tw)" << endl;
    token_list.clear();
    tknzr.han_conv_method = HAN_CONV_SIMP2TRAD;
    tknzr.split(text_str, token_list);
    cout << "Original string: " << text << endl;
    cout << "Tokenized result: ";
    for (token_iter = token_list.begin();
         token_iter != token_list.end(); ++token_iter) {
        cout << "[" << *token_iter << "] ";
    }
    cout << endl << endl;

    handler h;
    cout << "-- CJK Tokenizer Handler" << endl;
    tknzr.han_conv_method = HAN_CONV_NONE;
    tknzr.tokenize(text_str, h);
    cout << endl << endl;
    
    string cjk_str = "這是CJK字串";
    string pure_cjk_str =
        "這個字串只含中日韓。"
        "コンピューターの機能を、音響・映像・作品制御などに利用する芸術の総称。"
        "납치 여중생 공포에 떠는데'…경찰 200m 거리 25분만에 출동"
        "ㄅㄆㄇㄈㄉㄊㄋㄌㄧㄨㄩ";
    cout << "[" << cjk_str << "]" << " has CJK characters? "
         << tknzr.has_cjk(cjk_str) << endl;
    cout << "[" << cjk_str << "]" << " has CJK characters only? "
         << tknzr.has_cjk_only(cjk_str) << endl;
    cout << "[" << pure_cjk_str << "]" << " has CJK characters? "
         << tknzr.has_cjk(pure_cjk_str) << endl;
    cout << "[" << pure_cjk_str << "]" << " has CJK characters only? "
         << tknzr.has_cjk_only(pure_cjk_str) << endl;
    cout << endl;
    return 0;
}
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)