The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

parcel Lucy;

/** Judge how well a document matches a query.
 *
 * After determining whether a document matches a given query, a score must be
 * calculated which indicates how *well* the document matches the query.  The
 * Similarity class is used to judge how "similar" the query and the document
 * are to each other; the closer the resemblance, the higher the document
 * scores.
 *
 * The default implementation uses Lucene's modified cosine similarity
 * measure.  Subclasses might tweak the existing algorithms, or might be used
 * in conjunction with custom Query subclasses to implement arbitrary scoring
 * schemes.
 *
 * Most of the methods operate on single fields, but some are used to combine
 * scores from multiple fields.
 */

public class Lucy::Index::Similarity nickname Sim inherits Clownfish::Obj {

    /* Float array handed out by Get_Norm_Decoder().
     * NOTE(review): presumably a lookup table mapping each encoded norm
     * byte to its decoded float value -- confirm against the implementation.
     */
    float  *norm_decoder;

    /** Constructor. Takes no arguments.
     */
    public inert incremented Similarity*
    new();

    /** Initialize a Similarity.
     *
     * @param self The Similarity object to initialize.
     */
    public inert Similarity*
    init(Similarity *self);

    /** Factory method for creating a Posting.
     */
    incremented Posting*
    Make_Posting(Similarity *self);

    /** Factory method for creating a PostingWriter.
     */
    incremented PostingWriter*
    Make_Posting_Writer(Similarity *self, Schema *schema, Snapshot *snapshot,
                        Segment *segment, PolyReader *polyreader,
                        int32_t field_num);

    /** Return a score factor based on the frequency of a term in a given
     * document.  The default implementation is sqrt(freq).  Other
     * implementations typically produce ascending scores with ascending
     * freqs, since the more times a doc matches, the more relevant it is
     * likely to be.
     *
     * @param freq The frequency of the term in the document.
     */
    float
    TF(Similarity *self, float freq);

    /** Calculate the Inverse Document Frequency for a term in a given
     * collection.
     *
     * @param doc_freq The number of documents that the term appears in.
     * @param total_docs The number of documents in the collection.
     */
    float
    IDF(Similarity *self, int64_t doc_freq, int64_t total_docs);

    /** Calculate a score factor based on the number of terms which match.
     */
    float
    Coord(Similarity *self, uint32_t overlap, uint32_t max_overlap);

    /** Dampen the scores of long documents.
     *
     * After a field is broken up into terms at index-time, each term must be
     * assigned a weight.  One of the factors in calculating this weight is
     * the number of tokens that the original field was broken into.
     *
     * Typically, we assume that the more tokens in a field, the less
     * important any one of them is -- so that, e.g. 5 mentions of "Kafka" in
     * a short article are given more heft than 5 mentions of "Kafka" in an
     * entire book.  The default implementation of Length_Norm expresses this
     * using an inverted square root.
     *
     * However, the inverted square root has a tendency to reward very short
     * fields highly, which isn't always appropriate for fields you expect to
     * have a lot of tokens on average.
     *
     * @param num_tokens The number of tokens that the original field was
     * broken into.
     */
    public float
    Length_Norm(Similarity *self, uint32_t num_tokens);

    /** Normalize a Query's weight so that it is comparable to other Queries.
     */
    float
    Query_Norm(Similarity *self, float sum_of_squared_weights);

    /** Encode_Norm and Decode_Norm encode and decode between 32-bit IEEE
     * floating point numbers and a 5-bit exponent, 3-bit mantissa float
     * (8 bits total, so that the encoded value fits in a single byte).  The
     * range covered by the single-byte encoding is 7x10^9 to 2x10^-9.  The
     * accuracy is about one significant decimal digit.
     *
     * @param f The floating point value to encode.
     */
    uint8_t
    Encode_Norm(Similarity *self, float f);

    /** See Encode_Norm.
     *
     * NOTE(review): <code>input</code> is declared as uint32_t although
     * Encode_Norm produces a uint8_t; presumably only single-byte values are
     * meaningful here -- confirm against the implementation.
     */
    float
    Decode_Norm(Similarity *self, uint32_t input);

    /** Accessor returning a float array.  NOTE(review): presumably this is
     * the <code>norm_decoder</code> member -- confirm against the
     * implementation.
     */
    float*
    Get_Norm_Decoder(Similarity *self);

    /** Destructor.  NOTE(review): presumably releases
     * <code>norm_decoder</code> -- confirm against the implementation.
     */
    public void
    Destroy(Similarity *self);

    /** Produce a dump of this Similarity as a new object.  NOTE(review):
     * the structure of the returned object is not visible from this
     * declaration.
     */
    incremented Obj*
    Dump(Similarity *self);

    /** Create a new Similarity from a dump.  NOTE(review): presumably the
     * counterpart of Dump -- confirm against the implementation.
     *
     * @param dump The dumped representation to load from.
     */
    incremented Similarity*
    Load(Similarity *self, Obj *dump);

    /** Compare this Similarity against another object.
     *
     * @param other The object to compare against.
     */
    public bool
    Equals(Similarity *self, Obj *other);

    /** Serialize this Similarity to an OutStream.
     *
     * @param outstream The stream to write to.
     */
    void
    Serialize(Similarity *self, OutStream *outstream);

    /** Deserialize a Similarity from an InStream.  The <code>self</code>
     * argument is declared <code>decremented</code> and the return value
     * <code>incremented</code>; callers should use the returned object
     * rather than the one passed in.
     *
     * @param instream The stream to read from.
     */
    incremented Similarity*
    Deserialize(decremented Similarity *self, InStream *instream);
}