The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
parcel KinoSearch cnick Kino;

/** Unit of text.
 *
 * Token is the fundamental unit used by KinoSearch's Analyzer subclasses.
 * Each Token has 5 attributes: <code>text</code>, <code>start_offset</code>,
 * <code>end_offset</code>, <code>boost</code>, and <code>pos_inc</code>.
 * 
 * The <code>text</code> attribute is a Unicode string encoded as UTF-8.
 * 
 * <code>start_offset</code> is the start point of the token text, measured in
 * Unicode code points from the top of the stored field;
 * <code>end_offset</code> delimits the corresponding closing boundary.
 * <code>start_offset</code> and <code>end_offset</code> locate the Token
 * within a larger context, even if the Token's text attribute gets modified
 * -- by stemming, for instance.  The Token for "beating" in the text "beating
 * a dead horse" begins life with a start_offset of 0 and an end_offset of 7;
 * after stemming, the text is "beat", but the start_offset is still 0 and the
 * end_offset is still 7.  This allows "beating" to be highlighted correctly
 * after a search matches "beat".
 * 
 * <code>boost</code> is a per-token weight.  Use this when you want to assign
 * more or less importance to a particular token, as you might for emboldened
 * text within an HTML document, for example.  (Note: The field this token
 * belongs to must be spec'd to use a posting of type
 * L<KinoSearch::Index::Posting::RichPosting>.)
 * 
 * <code>pos_inc</code> is the POSition INCrement, measured in Tokens.  This
 * attribute, which defaults to 1, is an advanced tool for manipulating
 * phrase matching.  Ordinarily, Tokens are assigned consecutive position
 * numbers: 0, 1, and 2 for <code>"three blind mice"</code>.  However, if you
 * set the position increment for "blind" to, say, 1000, then the three tokens
 * will end up assigned to positions 0, 1, and 1001 -- and will no longer
 * produce a phrase match for the query <code>"three blind mice"</code>.
 */
class KinoSearch::Analysis::Token inherits KinoSearch::Object::Obj {

    /* Token text, a Unicode string encoded as UTF-8. */
    char     *text;
    /* Length of `text`; presumably a byte count rather than a code point
     * count, since `text` is UTF-8 -- TODO confirm against the
     * implementation. */
    size_t    len;
    /* Offsets locating the token within the stored field, measured in
     * Unicode code points from the top of the field. */
    uint32_t  start_offset;
    uint32_t  end_offset;
    /* Per-token weight; the constructors default it to 1.0. */
    float     boost;
    /* Position increment, measured in Tokens; the constructors default it
     * to 1. */
    int32_t   pos_inc;
    /* Token position.  Not a constructor argument; presumably derived from
     * pos_inc at a later stage -- TODO confirm where it is assigned. */
    int32_t   pos;

    /** Create a new Token.
     *
     * @param text The token text (UTF-8).
     * @param len Length of <code>text</code>.
     * @param start_offset Start of the token within the stored field, in
     * Unicode code points.
     * @param end_offset End of the token within the stored field, in Unicode
     * code points.
     * @param boost Per-token weight.
     * @param pos_inc Position increment, measured in Tokens.
     */
    inert incremented Token* 
    new(const char *text, size_t len, uint32_t start_offset, 
        uint32_t end_offset, float boost = 1.0, int32_t pos_inc = 1);

    /** Initializer counterpart to new(): takes the same arguments plus the
     * Token to be initialized, and returns a Token.
     */
    inert Token* 
    init(Token *self, const char *text, size_t len, 
         uint32_t start_offset, uint32_t end_offset, 
         float boost = 1.0, int32_t pos_inc = 1);

    /** Sort_quicksort-compatible comparison routine.
     *
     * NOTE(review): <code>va</code> and <code>vb</code> presumably point at
     * the elements being compared, and the ordering criterion is defined in
     * the implementation file -- confirm both there.
     */
    inert int 
    compare(void *context, const void *va, const void *vb);

    /** Accessor for the <code>start_offset</code> attribute.
     */
    uint32_t
    Get_Start_Offset(Token *self);

    /** Accessor for the <code>end_offset</code> attribute.
     */
    uint32_t
    Get_End_Offset(Token *self);

    /** Accessor for the <code>boost</code> attribute.
     */
    float
    Get_Boost(Token *self);

    /** Accessor for the <code>pos_inc</code> attribute.
     */
    int32_t
    Get_Pos_Inc(Token *self);

    /** Accessor for the token text.  Use Get_Len() to obtain the
     * accompanying length.
     */
    char*
    Get_Text(Token *self);

    /** Accessor for the length of the token text.
     */
    size_t
    Get_Len(Token *self);

    /** Replace the token text and its length.
     *
     * NOTE(review): whether <code>text</code> is copied or adopted is not
     * visible in this header -- confirm against the implementation before
     * relying on either behavior.
     */
    void
    Set_Text(Token *self, char *text, size_t len);

    /** Destructor for Token objects.
     */
    public void 
    Destroy(Token *self);
}

/* Copyright 2006-2011 Marvin Humphrey
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the same terms as Perl itself.
 */