core/KinoSearch/Analysis/Token.cfh

parcel KinoSearch cnick Kino;

/** Unit of text.
 *
 * Token is the fundamental unit used by KinoSearch's Analyzer subclasses.
 * Each Token has 5 attributes: <code>text</code>, <code>start_offset</code>,
 * <code>end_offset</code>, <code>boost</code>, and <code>pos_inc</code>.
 * 
 * The <code>text</code> attribute is a Unicode string encoded as UTF-8.
 * 
 * <code>start_offset</code> is the start point of the token text, measured in
 * Unicode code points from the top of the stored field;
 * <code>end_offset</code> delimits the corresponding closing boundary.
 * <code>start_offset</code> and <code>end_offset</code> locate the Token
 * within a larger context, even if the Token's text attribute gets modified
 * -- by stemming, for instance.  The Token for "beating" in the text "beating
 * a dead horse" begins life with a start_offset of 0 and an end_offset of 7;
 * after stemming, the text is "beat", but the start_offset is still 0 and the
 * end_offset is still 7.  This allows "beating" to be highlighted correctly
 * after a search matches "beat".
 * 
 * <code>boost</code> is a per-token weight.  Use this when you want to assign
 * more or less importance to a particular token, as you might for emboldened
 * text within an HTML document, for example.  (Note: The field this token
 * belongs to must be spec'd to use a posting of type
 * L<KinoSearch::Index::Posting::RichPosting>.)
 * 
 * <code>pos_inc</code> is the POSition INCrement, measured in Tokens.  This
 * attribute, which defaults to 1, is an advanced tool for manipulating
 * phrase matching.  Ordinarily, Tokens are assigned consecutive position
 * numbers: 0, 1, and 2 for <code>"three blind mice"</code>.  However, if you
 * set the position increment for "blind" to, say, 1000, then the three tokens
 * will end up assigned to positions 0, 1, and 1001 -- and will no longer
 * produce a phrase match for the query <code>"three blind mice"</code>.
 */
class KinoSearch::Analysis::Token inherits KinoSearch::Object::Obj {

    /* Token text; described in the class documentation as a Unicode string
     * encoded as UTF-8. */
    char     *text;
    /* Length of `text`.  NOTE(review): presumably a byte count rather than a
     * code point count -- confirm against the implementation. */
    size_t    len;
    /* Start of the token within the stored field, measured in Unicode code
     * points. */
    uint32_t  start_offset;
    /* Closing boundary of the token within the stored field, measured in
     * Unicode code points. */
    uint32_t  end_offset;
    /* Per-token weight. */
    float     boost;
    /* Position increment, measured in Tokens. */
    int32_t   pos_inc;
    /* NOTE(review): not settable through new()/init() and has no accessor
     * declared here; presumably the absolute token position, assigned
     * elsewhere from the accumulated pos_inc values -- confirm. */
    int32_t   pos;

    /** Create a new Token.
     *
     * @param text The token's text.
     * @param len Length of <code>text</code>.
     * @param start_offset Start offset within the stored field.
     * @param end_offset End offset within the stored field.
     * @param boost Per-token weight; defaults to 1.0.
     * @param pos_inc Position increment; defaults to 1.
     */
    inert incremented Token* 
    new(const char *text, size_t len, uint32_t start_offset, 
        uint32_t end_offset, float boost = 1.0, int32_t pos_inc = 1);

    /** Initializer taking the same attributes as new(), applied to an
     * existing Token supplied as <code>self</code>.
     */
    /* NOTE(review): presumably copies `text` and returns `self` -- confirm
     * against the implementation. */
    inert Token* 
    init(Token *self, const char *text, size_t len, 
         uint32_t start_offset, uint32_t end_offset, 
         float boost = 1.0, int32_t pos_inc = 1);

    /** Sort_quicksort-compatible comparison routine.
     */
    /* NOTE(review): `va` and `vb` presumably each point at a Token* array
     * element, and `context` is presumably unused; which attributes determine
     * the ordering is not visible from this declaration -- confirm. */
    inert int 
    compare(void *context, const void *va, const void *vb);

    /** Accessor for the <code>start_offset</code> attribute.
     */
    uint32_t
    Get_Start_Offset(Token *self);

    /** Accessor for the <code>end_offset</code> attribute.
     */
    uint32_t
    Get_End_Offset(Token *self);

    /** Accessor for the <code>boost</code> attribute.
     */
    float
    Get_Boost(Token *self);

    /** Accessor for the <code>pos_inc</code> attribute.
     */
    int32_t
    Get_Pos_Inc(Token *self);

    /** Accessor for the <code>text</code> attribute.  Pair with Get_Len() to
     * obtain the length.
     */
    /* NOTE(review): presumably returns the Token's internal buffer rather
     * than a copy, and the buffer may not be NUL-terminated -- confirm. */
    char*
    Get_Text(Token *self);

    /** Accessor for the length of the <code>text</code> attribute.
     */
    size_t
    Get_Len(Token *self);

    /** Replace the Token's text.
     *
     * @param text The new text.
     * @param len Length of <code>text</code>.
     */
    /* NOTE(review): whether `text` is copied or adopted is not visible from
     * this declaration -- confirm before relying on ownership. */
    void
    Set_Text(Token *self, char *text, size_t len);

    /* NOTE(review): presumably releases the text buffer before the object
     * itself is freed -- confirm against the implementation. */
    public void 
    Destroy(Token *self);
}

/* Copyright 2006-2011 Marvin Humphrey
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the same terms as Perl itself.
 */

	Global
`s`	Focus search bar
`?`	Bring up this help dialog

	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)

	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse

	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)