parcel KinoSearch cnick Kino;
/** Split a string into tokens.
*
* Generically, "tokenizing" is a process of breaking up a string into an
* array of "tokens". For instance, the string "three blind mice" might be
* tokenized into "three", "blind", "mice".
*
* KinoSearch::Analysis::Tokenizer decides where it should break up the text
* based on a regular expression compiled from a supplied <code>pattern</code>
* matching one token. If our source string is...
*
* "Eats, Shoots and Leaves."
*
* ... then a "whitespace tokenizer" with a <code>pattern</code> of
* <code>"\\S+"</code> produces...
*
* Eats,
* Shoots
* and
* Leaves.
*
* ... while a "word character tokenizer" with a <code>pattern</code> of
* <code>"\\w+"</code> produces...
*
* Eats
* Shoots
* and
* Leaves
*
* ... the difference being that the word character tokenizer skips over
* punctuation as well as whitespace when determining token boundaries.
*/
class KinoSearch::Analysis::Tokenizer
inherits KinoSearch::Analysis::Analyzer {
/* Analyzer subclass that splits text into tokens using a regular
* expression which matches one token.
*/
/* Source text of the regular expression that matches one token. */
CharBuf *pattern;
/* Compiled regular expression for matching a token, held as an opaque
* pointer.  NOTE(review): presumably this is the value installed by
* Set_Token_RE and its concrete type depends on the host regex engine --
* confirm against the implementation.
*/
void *token_re;
/* Constructor taking the same optional `pattern` argument as init();
* `pattern` defaults to NULL.  NOTE(review): presumably allocates a new
* Tokenizer and delegates to init() -- confirm against the implementation.
*/
inert incremented Tokenizer*
new(const CharBuf *pattern = NULL);
/**
* @param pattern A string specifying a Perl-syntax regular expression
* which should match one token. The default value is
* <code>\w+(?:[\x{2019}']\w+)*</code>, which matches "it's" as well as
* "it" and "O'Henry's" as well as "Henry".
*/
public inert Tokenizer*
init(Tokenizer *self, const CharBuf *pattern = NULL);
/* Takes an Inversion and returns an Inversion.  NOTE(review): presumably
* this is the Analyzer entry point that tokenizes the contents of
* `inversion` with the token regular expression -- confirm.
*/
public incremented Inversion*
Transform(Tokenizer *self, Inversion *inversion);
/* Takes a text CharBuf and returns an Inversion.  NOTE(review): presumably
* the returned Inversion holds the tokens matched in `text` -- confirm.
*/
public incremented Inversion*
Transform_Text(Tokenizer *self, CharBuf *text);
/** Tokenize the supplied string and add any Tokens generated to the
* supplied Inversion.
*/
void
Tokenize_Str(Tokenizer *self, const char *text, size_t len,
Inversion *inversion);
/** Set the compiled regular expression for matching a token. Also sets
* <code>pattern</code> as a side effect.
*/
void
Set_Token_RE(Tokenizer *self, void *token_re);
/* Returns an Obj representing this Tokenizer.  NOTE(review): presumably a
* serializable dump that records `pattern`, suitable for passing to Load()
* -- confirm.
*/
public incremented Obj*
Dump(Tokenizer *self);
/* Takes a dump Obj and returns a Tokenizer.  NOTE(review): presumably
* the inverse of Dump() -- confirm.
*/
public incremented Tokenizer*
Load(Tokenizer *self, Obj *dump);
/* Compares this Tokenizer against another object.  NOTE(review): presumably
* true only when `other` is a Tokenizer with an equal `pattern` -- confirm.
*/
public bool_t
Equals(Tokenizer *self, Obj *other);
/* Destructor.  NOTE(review): presumably releases `pattern` and `token_re`
* -- confirm against the implementation.
*/
public void
Destroy(Tokenizer *self);
}
/* Copyright 2005-2011 Marvin Humphrey
*
* This program is free software; you can redistribute it and/or modify it
* under the same terms as Perl itself.
*/