The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_HIGHLIGHTER
#define C_LUCY_SPAN
#include <ctype.h>
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Highlight/Highlighter.h"
#include "Lucy/Document/HitDoc.h"
#include "Lucy/Search/Compiler.h"
#include "Lucy/Search/Query.h"
#include "Lucy/Search/Searcher.h"
#include "Lucy/Highlight/HeatMap.h"
#include "Lucy/Search/Span.h"
#include "Lucy/Index/DocVector.h"

const uint32_t ELLIPSIS_CODE_POINT = 0x2026;

/* If Highlighter_Encode has been overridden, return its output.  If not,
 * increment the refcount of the supplied encode_buf and call encode_entities.
 * Either way, the caller takes responsibility for one refcount.
 *
 * The point of this routine is to minimize CharBuf object creation when
 * possible.
 */
static CharBuf*
S_do_encode(Highlighter *self, CharBuf *text, CharBuf **encode_buf);

// Place HTML entity encoded version of [text] into [encoded].
static CharBuf*
S_encode_entities(CharBuf *text, CharBuf *encoded);

Highlighter*
Highlighter_new(Searcher *searcher, Obj *query, const CharBuf *field,
                uint32_t excerpt_length) {
    Highlighter *self = (Highlighter*)VTable_Make_Obj(HIGHLIGHTER);
    return Highlighter_init(self, searcher, query, field, excerpt_length);
}

Highlighter*
Highlighter_init(Highlighter *self, Searcher *searcher, Obj *query,
                 const CharBuf *field, uint32_t excerpt_length) {
    self->query          = Searcher_Glean_Query(searcher, query);
    self->searcher       = (Searcher*)INCREF(searcher);
    self->field          = CB_Clone(field);
    self->compiler       = Query_Make_Compiler(self->query, searcher,
                                               Query_Get_Boost(self->query),
                                               false);
    self->excerpt_length = excerpt_length;
    self->slop           = excerpt_length / 3;
    self->window_width   = excerpt_length + (self->slop * 2);
    self->pre_tag        = CB_new_from_trusted_utf8("<strong>", 8);
    self->post_tag       = CB_new_from_trusted_utf8("</strong>", 9);
    return self;
}

void
Highlighter_destroy(Highlighter *self) {
    DECREF(self->searcher);
    DECREF(self->query);
    DECREF(self->compiler);
    DECREF(self->field);
    DECREF(self->pre_tag);
    DECREF(self->post_tag);
    SUPER_DESTROY(self, HIGHLIGHTER);
}

CharBuf*
Highlighter_highlight(Highlighter *self, const CharBuf *text) {
    size_t   size   = CB_Get_Size(text)
                      + CB_Get_Size(self->pre_tag)
                      + CB_Get_Size(self->post_tag);
    CharBuf *retval = CB_new(size);
    CB_Cat(retval, self->pre_tag);
    CB_Cat(retval, text);
    CB_Cat(retval, self->post_tag);
    return retval;
}

void
Highlighter_set_pre_tag(Highlighter *self, const CharBuf *pre_tag) {
    CB_Mimic(self->pre_tag, (Obj*)pre_tag);
}

void
Highlighter_set_post_tag(Highlighter *self, const CharBuf *post_tag) {
    CB_Mimic(self->post_tag, (Obj*)post_tag);
}

CharBuf*
Highlighter_get_pre_tag(Highlighter *self) {
    return self->pre_tag;
}

CharBuf*
Highlighter_get_post_tag(Highlighter *self) {
    return self->post_tag;
}

CharBuf*
Highlighter_get_field(Highlighter *self) {
    return self->field;
}

Query*
Highlighter_get_query(Highlighter *self) {
    return self->query;
}

Searcher*
Highlighter_get_searcher(Highlighter *self) {
    return self->searcher;
}

Compiler*
Highlighter_get_compiler(Highlighter *self) {
    return self->compiler;
}

uint32_t
Highlighter_get_excerpt_length(Highlighter *self) {
    return self->excerpt_length;
}

CharBuf*
Highlighter_create_excerpt(Highlighter *self, HitDoc *hit_doc) {
    ZombieCharBuf *field_val
        = (ZombieCharBuf*)HitDoc_Extract(hit_doc, self->field,
                                         (ViewCharBuf*)ZCB_BLANK());

    if (!field_val || !Obj_Is_A((Obj*)field_val, CHARBUF)) {
        return NULL;
    }
    else if (!ZCB_Get_Size(field_val)) {
        // Empty string yields empty string.
        return CB_new(0);
    }
    else {
        ZombieCharBuf *fragment = ZCB_WRAP((CharBuf*)field_val);
        CharBuf *raw_excerpt = CB_new(self->excerpt_length + 10);
        CharBuf *highlighted = CB_new((self->excerpt_length * 3) / 2);
        DocVector *doc_vec
            = Searcher_Fetch_Doc_Vec(self->searcher,
                                     HitDoc_Get_Doc_ID(hit_doc));
        VArray *maybe_spans
            = Compiler_Highlight_Spans(self->compiler, self->searcher,
                                       doc_vec, self->field);
        VArray *score_spans = maybe_spans ? maybe_spans : VA_new(0);
        HeatMap *heat_map
            = HeatMap_new(score_spans, (self->excerpt_length * 2) / 3);
        int32_t top
            = Highlighter_Find_Best_Fragment(self, (CharBuf*)field_val,
                                             (ViewCharBuf*)fragment, heat_map);
        VArray *sentences
            = Highlighter_Find_Sentences(self, (CharBuf*)field_val, 0,
                                         top + self->window_width);

        top = Highlighter_Raw_Excerpt(self, (CharBuf*)field_val,
                                      (CharBuf*)fragment, raw_excerpt, top,
                                      heat_map, sentences);
        Highlighter_highlight_excerpt(self, HeatMap_Get_Spans(heat_map),
                                      raw_excerpt, highlighted, top);

        DECREF(sentences);
        DECREF(heat_map);
        DECREF(score_spans);
        DECREF(doc_vec);
        DECREF(raw_excerpt);

        return highlighted;
    }
}

static int32_t
S_hottest(HeatMap *heat_map) {
    float max_score = 0.0f;
    int32_t retval = 0;
    VArray *spans = HeatMap_Get_Spans(heat_map);
    for (uint32_t i = VA_Get_Size(spans); i--;) {
        Span *span = (Span*)VA_Fetch(spans, i);
        if (span->weight >= max_score) {
            retval = span->offset;
            max_score = span->weight;
        }
    }
    return retval;
}

int32_t
Highlighter_find_best_fragment(Highlighter *self, const CharBuf *field_val,
                               ViewCharBuf *fragment, HeatMap *heat_map) {
    // Window is 1.66 * excerpt_length, with the loc in the middle.
    int32_t best_location = S_hottest(heat_map);

    if (best_location < (int32_t)self->slop) {
        // If the beginning of the string falls within the window centered
        // around the hottest point in the field, start the fragment at the
        // beginning.
        ViewCB_Assign(fragment, (CharBuf*)field_val);
        int32_t top = ViewCB_Trim_Top(fragment);
        ViewCB_Truncate(fragment, self->window_width);
        return top;
    }
    else {
        int32_t top = best_location - self->slop;
        ViewCB_Assign(fragment, (CharBuf*)field_val);
        ViewCB_Nip(fragment, top);
        top += ViewCB_Trim_Top(fragment);
        int32_t chars_left = ViewCB_Truncate(fragment, self->excerpt_length);
        int32_t overrun = self->excerpt_length - chars_left;

        if (!overrun) {
            // We've found an acceptable window.
            ViewCB_Assign(fragment, (CharBuf*)field_val);
            ViewCB_Nip(fragment, top);
            top += ViewCB_Trim_Top(fragment);
            ViewCB_Truncate(fragment, self->window_width);
            return top;
        }
        else if (overrun > top) {
            // The field is very short, so make the whole field the
            // "fragment".
            ViewCB_Assign(fragment, (CharBuf*)field_val);
            return ViewCB_Trim_Top(fragment);
        }
        else {
            // The fragment is too close to the end, so slide it back.
            top -= overrun;
            ViewCB_Assign(fragment, (CharBuf*)field_val);
            ViewCB_Nip(fragment, top);
            top += ViewCB_Trim_Top(fragment);
            ViewCB_Truncate(fragment, self->excerpt_length);
            return top;
        }
    }
}

// Return true if the window represented by "offset" and "length" overlaps a
// score span, or if there are no score spans so that no excerpt is measurably
// superior.
static bool_t
S_has_heat(HeatMap *heat_map, int32_t offset, int32_t length) {
    VArray   *spans     = HeatMap_Get_Spans(heat_map);
    uint32_t  num_spans = VA_Get_Size(spans);
    int32_t   end       = offset + length;

    if (length == 0)    { return false; }
    if (num_spans == 0) { return true; }

    for (uint32_t i = 0; i < num_spans; i++) {
        Span *span  = (Span*)VA_Fetch(spans, i);
        int32_t span_start = span->offset;
        int32_t span_end   = span_start + span->length;
        if (offset >= span_start && offset <  span_end) { return true; }
        if (end    >  span_start && end    <= span_end) { return true; }
        if (offset <= span_start && end    >= span_end) { return true; }
        if (span_start > end) { break; }
    }

    return false;
}

int32_t
Highlighter_raw_excerpt(Highlighter *self, const CharBuf *field_val,
                        const CharBuf *fragment, CharBuf *raw_excerpt,
                        int32_t top, HeatMap *heat_map, VArray *sentences) {
    bool_t   found_starting_edge = false;
    bool_t   found_ending_edge   = false;
    int32_t  start = top;
    int32_t  end   = 0;
    double   field_len = CB_Length(field_val);
    uint32_t min_len = field_len < self->excerpt_length * 0.6666
                       ? (uint32_t)field_len
                       : (uint32_t)(self->excerpt_length * 0.6666);

    // Try to find a starting sentence boundary.
    const uint32_t num_sentences = VA_Get_Size(sentences);
    if (num_sentences) {
        for (uint32_t i = 0; i < num_sentences; i++) {
            Span *sentence = (Span*)VA_Fetch(sentences, i);
            int32_t candidate = sentence->offset;

            if (candidate > top + (int32_t)self->window_width) {
                break;
            }
            else if (candidate >= top) {
                // Try to start on the first sentence boundary, but only if
                // there's enough relevant material left after it in the
                // fragment.
                ZombieCharBuf *temp = ZCB_WRAP(fragment);
                ZCB_Nip(temp, candidate - top);
                uint32_t chars_left = ZCB_Truncate(temp, self->excerpt_length);
                if (chars_left >= min_len
                    && S_has_heat(heat_map, candidate, chars_left)
                   ) {
                    start = candidate;
                    found_starting_edge = true;
                    break;
                }
            }
        }
    }

    // Try to end on a sentence boundary (but don't try very hard).
    if (num_sentences) {
        ZombieCharBuf *start_trimmed = ZCB_WRAP(fragment);
        ZCB_Nip(start_trimmed, start - top);

        for (uint32_t i = num_sentences; i--;) {
            Span    *sentence  = (Span*)VA_Fetch(sentences, i);
            int32_t  last_edge = sentence->offset + sentence->length;

            if (last_edge <= start) {
                break;
            }
            else if (last_edge - start > (int32_t)self->excerpt_length) {
                continue;
            }
            else {
                uint32_t chars_left = last_edge - start;
                if (chars_left > min_len
                    && S_has_heat(heat_map, start, chars_left)
                   ) {
                    found_ending_edge = true;
                    end = last_edge;
                    break;
                }
                else {
                    ZombieCharBuf *temp = ZCB_WRAP((CharBuf*)start_trimmed);
                    ZCB_Nip(temp, chars_left);
                    ZCB_Trim_Tail(temp);
                    if (ZCB_Get_Size(temp) == 0) {
                        // Short, but ending on a boundary already.
                        found_ending_edge = true;
                        end = last_edge;
                        break;
                    }
                }
            }
        }
    }
    int32_t this_excerpt_len = found_ending_edge
                               ? end - start
                               : (int32_t)self->excerpt_length;
    if (!this_excerpt_len) { return start; }

    if (found_starting_edge) {
        ZombieCharBuf *temp = ZCB_WRAP((CharBuf*)field_val);
        ZCB_Nip(temp, start);
        ZCB_Truncate(temp, this_excerpt_len);
        CB_Mimic(raw_excerpt, (Obj*)temp);
    }
    // If not starting on a sentence boundary, prepend an ellipsis.
    else {
        ZombieCharBuf *temp = ZCB_WRAP((CharBuf*)field_val);
        const size_t ELLIPSIS_LEN = 2; // Unicode ellipsis plus a space.

        // If the excerpt is already shorter than the spec'd length, we might
        // not need to make room.
        this_excerpt_len += ELLIPSIS_LEN;

        // Move the start back one in case the character right before the
        // excerpt starts is whitespace.
        if (start) {
            this_excerpt_len += 1;
            start -= 1;
            ZCB_Nip(temp, start);
        }

        do {
            uint32_t code_point = ZCB_Nip_One(temp);
            start++;
            this_excerpt_len--;

            if (StrHelp_is_whitespace(code_point)) {
                if (!found_ending_edge) {
                    // If we still need room, we'll lop it off the end since
                    // we don't know a solid end point yet.
                    break;
                }
                else if (this_excerpt_len <= (int32_t)self->excerpt_length) {
                    break;
                }
            }
        } while (ZCB_Get_Size(temp));

        ZCB_Truncate(temp, self->excerpt_length - ELLIPSIS_LEN);
        CB_Cat_Char(raw_excerpt, ELLIPSIS_CODE_POINT);
        CB_Cat_Char(raw_excerpt, ' ');
        CB_Cat(raw_excerpt, (CharBuf*)temp);
        start -= ELLIPSIS_LEN;
    }

    // If excerpt doesn't end on a sentence boundary, tack on an ellipsis.
    if (found_ending_edge) {
        CB_Truncate(raw_excerpt, end - start);
    }
    else {
        do {
            uint32_t code_point = CB_Code_Point_From(raw_excerpt, 1);
            CB_Chop(raw_excerpt, 1);
            if (StrHelp_is_whitespace(code_point)) {
                CB_Trim_Tail(raw_excerpt);

                // Strip punctuation that collides with an ellipsis.
                code_point = CB_Code_Point_From(raw_excerpt, 1);
                while (code_point == '.'
                       || code_point == ','
                       || code_point == ';'
                       || code_point == ':'
                       || code_point == ':'
                       || code_point == '?'
                       || code_point == '!'
                      ) {
                    CB_Chop(raw_excerpt, 1);
                    code_point = CB_Code_Point_From(raw_excerpt, 1);
                }

                break;
            }
        } while (CB_Get_Size(raw_excerpt));
        CB_Cat_Char(raw_excerpt, ELLIPSIS_CODE_POINT);
    }

    return start;
}

void
Highlighter_highlight_excerpt(Highlighter *self, VArray *spans,
                              CharBuf *raw_excerpt, CharBuf *highlighted,
                              int32_t top) {
    int32_t        last_end        = 0;
    ZombieCharBuf *temp            = ZCB_WRAP(raw_excerpt);
    CharBuf       *encode_buf      = NULL;
    int32_t        raw_excerpt_end = top + CB_Length(raw_excerpt);

    for (uint32_t i = 0, max = VA_Get_Size(spans); i < max; i++) {
        Span *span = (Span*)VA_Fetch(spans, i);
        if (span->offset < top) {
            continue;
        }
        else if (span->offset >= raw_excerpt_end) {
            break;
        }
        else {
            int32_t relative_start = span->offset - top;
            int32_t relative_end   = relative_start + span->length;

            if (relative_start > last_end) {
                CharBuf *encoded;
                int32_t non_highlighted_len = relative_start - last_end;
                ZombieCharBuf *to_cat = ZCB_WRAP((CharBuf*)temp);
                ZCB_Truncate(to_cat, non_highlighted_len);
                encoded = S_do_encode(self, (CharBuf*)to_cat, &encode_buf);
                CB_Cat(highlighted, (CharBuf*)encoded);
                ZCB_Nip(temp, non_highlighted_len);
                DECREF(encoded);
            }
            if (relative_end > relative_start) {
                CharBuf *encoded;
                CharBuf *hl_frag;
                int32_t highlighted_len = relative_end - relative_start;
                ZombieCharBuf *to_cat = ZCB_WRAP((CharBuf*)temp);
                ZCB_Truncate(to_cat, highlighted_len);
                encoded = S_do_encode(self, (CharBuf*)to_cat, &encode_buf);
                hl_frag = Highlighter_Highlight(self, encoded);
                CB_Cat(highlighted, hl_frag);
                ZCB_Nip(temp, highlighted_len);
                DECREF(hl_frag);
                DECREF(encoded);
            }
            last_end = relative_end;
        }
    }

    // Last text, beyond last highlight span.
    if (ZCB_Get_Size(temp)) {
        CharBuf *encoded = S_do_encode(self, (CharBuf*)temp, &encode_buf);
        CB_Cat(highlighted, encoded);
        DECREF(encoded);
    }
    CB_Trim_Tail(highlighted);

    DECREF(encode_buf);
}

static Span*
S_start_sentence(int32_t pos) {
    return Span_new(pos, 0, 0.0);
}

static void
S_close_sentence(VArray *sentences, Span **sentence_ptr,
                 int32_t sentence_end) {
    Span *sentence = *sentence_ptr;
    int32_t length = sentence_end - Span_Get_Offset(sentence);
    const int32_t MIN_SENTENCE_LENGTH = 3; // e.g. "OK.", but not "2."
    if (length >= MIN_SENTENCE_LENGTH) {
        Span_Set_Length(sentence, length);
        VA_Push(sentences, (Obj*)sentence);
        *sentence_ptr = NULL;
    }
}

VArray*
Highlighter_find_sentences(Highlighter *self, CharBuf *text, int32_t offset,
                           int32_t length) {
    /* When [sentence] is NULL, that means a sentence start has not yet been
     * found.  When it is a Span object, we have a start, but we haven't found
     * an end.  Once we find the end, we add the sentence to the [sentences]
     * array and set [sentence] back to NULL to indicate that we're looking
     * for a start once more.
     */
    Span    *sentence       = NULL;
    VArray  *sentences      = VA_new(10);
    int32_t  stop           = length == 0
                              ? I32_MAX
                              : offset + length;
    ZombieCharBuf *fragment = ZCB_WRAP(text);
    int32_t  pos            = ZCB_Trim_Top(fragment);
    UNUSED_VAR(self);

    /* Our first task will be to find a sentence that either starts at the top
     * of the fragment, or overlaps its start. Starting at the top of the
     * field is a special case: we define the first non-whitespace character
     * to begin a sentence, rather than look for the first character following
     * a period and whitespace.  Everywhere else, we have to define sentence
     * starts based on a sentence end that has just passed by.
     */
    if (offset <= pos) {
        // Assume that first non-whitespace character begins a sentence.
        if (pos < stop && ZCB_Get_Size(fragment) > 0) {
            sentence = S_start_sentence(pos);
        }
    }
    else {
        ZCB_Nip(fragment, offset - pos);
        pos = offset;
    }

    while (1) {
        uint32_t code_point = ZCB_Code_Point_At(fragment, 0);
        if (!code_point) {
            // End of fragment.  If we have a sentence open, close it,
            // then bail.
            if (sentence) { S_close_sentence(sentences, &sentence, pos); }
            break;
        }
        else if (code_point == '.') {
            uint32_t whitespace_count;
            pos += ZCB_Nip(fragment, 1); // advance past "."

            if (pos == stop && ZCB_Get_Size(fragment) == 0) {
                // Period ending the field string.
                if (sentence) { S_close_sentence(sentences, &sentence, pos); }
                break;
            }
            else if (0 != (whitespace_count = ZCB_Trim_Top(fragment))) {
                // We've found a period followed by whitespace.  Close out the
                // existing sentence, if there is one. */
                if (sentence) { S_close_sentence(sentences, &sentence, pos); }

                // Advance past whitespace.
                pos += whitespace_count;
                if (pos < stop && ZCB_Get_Size(fragment) > 0) {
                    // Not at the end of the string? Then we've found a
                    // sentence start.
                    sentence = S_start_sentence(pos);
                }
            }

            // We may not have reached the end of the field yet, but it's
            // entirely possible that our last sentence overlapped the end of
            // the fragment -- in which case, it's time to bail.
            if (pos >= stop) { break; }
        }
        else {
            ZCB_Nip(fragment, 1);
            pos++;
        }
    }

    return sentences;
}

CharBuf*
Highlighter_encode(Highlighter *self, CharBuf *text) {
    CharBuf *encoded = CB_new(0);
    UNUSED_VAR(self);
    return S_encode_entities(text, encoded);
}

static CharBuf*
S_do_encode(Highlighter *self, CharBuf *text, CharBuf **encode_buf) {
    if (OVERRIDDEN(self, Highlighter, Encode, encode)) {
        return Highlighter_Encode(self, text);
    }
    else {
        if (*encode_buf == NULL) { *encode_buf = CB_new(0); }
        (void)S_encode_entities(text, *encode_buf);
        return (CharBuf*)INCREF(*encode_buf);
    }
}

static CharBuf*
S_encode_entities(CharBuf *text, CharBuf *encoded) {
    ZombieCharBuf *temp = ZCB_WRAP(text);
    size_t space = 0;
    const int MAX_ENTITY_BYTES = 9; // &#dddddd;

    // Scan first so that we only allocate once.
    uint32_t code_point;
    while (0 != (code_point = ZCB_Nip_One(temp))) {
        if (code_point > 127
            || (!isgraph(code_point) && !isspace(code_point))
            || code_point == '<'
            || code_point == '>'
            || code_point == '&'
            || code_point == '"'
           ) {
            space += MAX_ENTITY_BYTES;
        }
        else {
            space += 1;
        }
    }

    CB_Grow(encoded, space);
    CB_Set_Size(encoded, 0);
    ZCB_Assign(temp, text);
    while (0 != (code_point = ZCB_Nip_One(temp))) {
        if (code_point > 127
            || (!isgraph(code_point) && !isspace(code_point))
           ) {
            CB_catf(encoded, "&#%u32;", code_point);
        }
        else if (code_point == '<') {
            CB_Cat_Trusted_Str(encoded, "&lt;", 4);
        }
        else if (code_point == '>') {
            CB_Cat_Trusted_Str(encoded, "&gt;", 4);
        }
        else if (code_point == '&') {
            CB_Cat_Trusted_Str(encoded, "&amp;", 5);
        }
        else if (code_point == '"') {
            CB_Cat_Trusted_Str(encoded, "&quot;", 6);
        }
        else {
            CB_Cat_Char(encoded, code_point);
        }
    }

    return encoded;
}