cld-src/encodings/compact_lang_det/getonescriptspan.cc

// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "encodings/compact_lang_det/getonescriptspan.h"
#include <stdio.h>
#include <string.h>

#include "encodings/lang_enc.h"
#include "encodings/compact_lang_det/utf8propjustletter.h"
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"

#include "encodings/compact_lang_det/win/cld_basictypes.h"
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
#include "encodings/compact_lang_det/win/cld_google.h"
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
#include "encodings/compact_lang_det/win/cld_unilib.h"
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
#include "encodings/compact_lang_det/win/cld_utf8utils.h"

// Language value 254 under the name GRAY_LANG. It is not referenced in the
// visible part of this file.
// NOTE(review): the meaning of "gray" is not established here -- confirm
// against the users of this constant.
static const Language GRAY_LANG = (Language)254;

// Word-boundary rounding limits, in bytes. Neither constant is referenced in
// the visible part of this file.
static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
                                                  // else make shorter
static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
                                                  // to round to word boundary,
                                                  // direction above

// Byte-indexed lookup table with one entry per possible byte value. The only
// nonzero entries are 0x26 '&', 0x3C '<' and 0x3E '>'. IsSpecial() consults
// it after first narrowing the byte to the range 0x20..0x3F.
static const char kSpecialSymbol[256] = {       // true for < > &
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};



// Character categories. Each value is a column index into one 20-entry row
// of kTagParseTbl_0. The names are short-lived macros so that the table
// below stays readable; they are all #undef'ed right after it.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
// The letters that occur in "SCRIPT" and "STYLE"; the table maps both the
// upper-case and the lower-case form of each letter to the same category.
#define S_ 7
#define C_ 8
#define R_ 9
#define I_ 10
#define P_ 11
#define T_ 12
#define Y_ 13
#define L_ 14
#define E_ 15
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing
// Layout: 16 bytes per line, four groups of four lines:
//   0x00-0x3F  control characters, punctuation and digits
//   0x40-0x7F  ASCII letters and the remaining punctuation
//   0x80-0xBF  all NL
//   0xC0-0xFF  all PL
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

// Retire the category macros so the short names do not leak into the rest of
// the file.
#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx


// The two table entries that make ScanToPossibleLetter() stop scanning:
// OK (0) is a normal stop, X_ (1) is the error entry.
#define OK 0
#define X_ 1

// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// Layout: one row of 20 entries per state, so the entry for (state, category)
// is at index state * 20 + category, where category comes from kCharToSub.
// Each entry is the next state; the bracketed number at the end of a row is
// that row's state number and the text after it is the input matched so far.
// Rows 13-27 track <script ...> ... </script> and rows 28-39 track
// <style ...> ... </style>.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

// Retire the two helper macros now that the table is built.
#undef OK
#undef X_


/*
// Convert GetTimeOfDay output to 64-bit usec
static inline uint64 Microseconds(const struct timeval& t) {
  // The SumReducer uses uint64, so convert to (uint64) microseconds,
  // not (double) seconds.
  return t.tv_sec * 1000000ULL + t.tv_usec;
}
*/


// Returns true if character is < > or &
bool inline IsSpecial(char c) {
  if ((c & 0xe0) == 0x20) {
    return kSpecialSymbol[static_cast<uint8>(c)];
  }
  return false;
}

// Fast skip over bytes that are neither letters nor one of < > &
// Runs the utf8scannotjustletterspecial scanner over src[0..len) and returns
// the number of bytes it reports as consumed, i.e. the offset of the next
// interesting byte (or of end of string).
int ScanToLetterOrSpecial(const char* src, int len) {
  int consumed;   // Filled in by the scanner
  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, &consumed);
  return consumed;
}



// src points to non-letter, such as tag-opening '<'
// Return length from here to next possible letter
// On eos or another < before >, return 1
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (1)
// advances <tag <tag2>
//          ||  (1)
int ScanToPossibleLetter(const char* isrc, int len) {
  const uint8* src = reinterpret_cast<const uint8*>(isrc);
  const uint8* srclimit = src + len;
  const uint8* tagParseTbl = kTagParseTbl_0;
  int e = 0;
  while (src < srclimit) {
    e = tagParseTbl[kCharToSub[*src++]];
    if ((e & ~1) == 0) {
      // We overshot by one byte
      --src;
      break;
    }
    tagParseTbl = &kTagParseTbl_0[e * 20];
  }

  if (src >= srclimit) {
    // We fell off the end of the text.
    // It looks like the most common case for this is a truncated file, not
    // mismatched angle brackets. So we pretend that the last char was '>'
    return len;
  }

  // OK to be in state 0 or state 2 at exit
  if ((e != 0) && (e != 2)) {
    // Error, '<' followed by '<'
    // We want to back up to first <, then advance by one byte past it
    int offset = src - reinterpret_cast<const uint8*>(isrc);
    // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);

    // Backscan to first '<' and return enough length to just get past it
    --offset;   // back up over the second '<', which caused us to stop
    while ((0 < offset) && (isrc[offset] != '<')) {
      // Find the first '<', which is unmatched
      --offset;
    }
    // skip to just beyond first '<'
    // printf("  returning %d\n", offset + 1);
    return offset + 1;
  }

  return src - reinterpret_cast<const uint8*>(isrc);
}



// Set up a scanner over buffer[0 .. buffer_length).
// Only pointers into buffer are stored, so no copy of the text is made here.
// is_plain_text turns off the special handling of < > & during scanning.
// Two scratch arrays are allocated here and released by the destructor.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
    : start_byte_(buffer),
      next_byte_(buffer),
      next_byte_limit_(buffer + buffer_length),
      byte_length_(buffer_length),
      is_plain_text_(is_plain_text) {
  // Output area for GetOneScriptSpan
  script_buffer_ = new char[getone::kMaxScriptBuffer];
  // Output area for LowerScriptSpan
  script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
}

// Release the two scratch arrays allocated by the constructor.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_lower_;
  delete[] script_buffer_;
}




// Get to the first real non-tag letter or entity that is a letter.
//   src, len  text to scan
//   script    out: script number of the letter found. Set to UNKNOWN_LSCRIPT
//             when the scan reaches the end of the text without finding one,
//             so the caller never sees an unwritten value.
// Returns the byte offset of that letter (for an entity, the offset of its
// '&'), or len if there are no more letters.
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
  int sc = UNKNOWN_LSCRIPT;
  int skip = 0;

  // Do run of non-letters (tag | &NL | NL)*
  while (skip < len) {
    // Do fast scan to next interesting byte
    skip += ScanToLetterOrSpecial(src + skip, len - skip);

    // Check for no more letters/specials
    if (skip >= len) {
      // All done; still give the out-parameter a defined value
      *script = UNKNOWN_LSCRIPT;
      return len;
    }

    // Bytes to step over if this item is not a letter. Every branch below is
    // expected to overwrite it; the default of one byte keeps the loop moving
    // forward regardless.
    int tlen = 1;
    int plen = 0;

    // We are at a letter, nonletter, tag, or entity
    if (IsSpecial(src[skip]) && !is_plain_text_) {
      if (src[skip] == '<') {
        // Beginning of tag; skip to end and go around again
        tlen = ScanToPossibleLetter(src + skip, len - skip);
        sc = 0;
      } else if (src[skip] == '>') {
        // Unexpected end of tag; skip it and go around again
        tlen = 1;         // Over the >
        sc = 0;
      } else if (src[skip] == '&') {
        // Expand entity into a scratch buffer, no advance.
        // The buffer is zero-filled first so that the classification below
        // reads defined bytes even if the helper stores nothing.
        // NOTE(review): whether EntityToBuffer can leave the buffer untouched
        // (e.g. for a malformed entity) is not visible here -- confirm.
        char temp[4] = {0, 0, 0, 0};
        EntityToBuffer(src + skip, len - skip,
                       temp, &tlen, &plen);
        sc = getone::GetUTF8LetterScriptNum(temp);
      }
    } else {
      // Ordinary character: 1..4 bytes
      tlen = cld_UniLib::OneCharLen(src + skip);
      sc = getone::GetUTF8LetterScriptNum(src + skip);
    }

    if (sc != 0) {break;}           // Letter found
    skip += tlen;                   // Advance
  }

  *script = sc;
  return skip;
}



// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Buffer has leading space; each run of non-letters is reduced to one space.
//   span  out: text/text_bytes describe the copied letters in script_buffer_,
//         offset is the input position at entry, script is the script of the
//         span, truncated is set when the output buffer filled up.
// Returns false when no more letters can be found, true otherwise.
// Advances next_byte_ / byte_length_ past the input that was consumed.
bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
  span->text = script_buffer_;
  span->text_bytes = 0;
  span->offset = next_byte_ - start_byte_;
  span->script = UNKNOWN_LSCRIPT;
  span->lang = UNKNOWN_LANGUAGE;
  span->truncated = false;

  int spanscript = UNKNOWN_LSCRIPT;  // The script of this span
  int sc = UNKNOWN_LSCRIPT;          // The script of next character
  int tlen = 0;                      // Input bytes used by the current item
  int plen = 0;                      // Output bytes made by the current item

  script_buffer_[0] = ' ';  // Always a space at front of output
  script_buffer_[1] = '\0';
  int take = 0;
  int put = 1;              // Start after the initial space

  // Get to the first real non-tag letter or entity that is a letter
  const int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
  next_byte_ += skip;
  byte_length_ -= skip;
  if (byte_length_ <= 0) {
    return false;               // No more letters to be found
  }

  // There is at least one letter, so we know the script for this span
  span->script = static_cast<UnicodeLScript>(spanscript);

  // Go over alternating spans of same-script letters and non-letters,
  // copying letters to buffer with single spaces for each run of non-letters
  while (take < byte_length_) {
    // Copy run of letters in same script (&LS | LS)*
    bool need_break = false;
    while (take < byte_length_) {
      // We are at a letter, nonletter, tag, or entity
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if (next_byte_[take] == '<') {
          // Beginning of tag
          sc = 0;
          break;
        } else if (next_byte_[take] == '>') {
          // Unexpected end of tag
          sc = 0;
          break;
        } else if (next_byte_[take] == '&') {
          // Copy entity, no advance
          // NOTE(review): if EntityToBuffer can store nothing (plen == 0),
          // the classification below reads whatever the buffer held before
          // -- confirm the helper's contract.
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        // Real letter, safely copy up to 4 bytes, increment by 1..4
        // Will update by 1..4 bytes at Advance, below
        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
        if (take < (byte_length_ - 3)) {
          // Fast case: at least four input bytes remain, so always move four.
          // A fixed-size memcpy stores the same bytes as a 32-bit store but
          // is valid at any alignment and does not break aliasing rules.
          memcpy(script_buffer_ + put, next_byte_ + take, 4);
        } else {
          // Slow case, happens 1-3 times per input document
          memcpy(script_buffer_ + put, next_byte_ + take, plen);
        }
        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
      }

      // Allow continue across a single letter in a different script:
      // A B D = three scripts, c = common script, i = inherited script,
      // - = don't care, ( = take position before the += below
      //  AAA(A-    continue
      //
      //  AAA(BA    continue
      //  AAA(BB    break
      //  AAA(Bc    continue (breaks after B)
      //  AAA(BD    break
      //  AAA(Bi    break
      //
      //  AAA(c-    break
      //
      //  AAA(i-    continue
      //

      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
        // Might need to break this script span
        if (sc == ULScript_Common) {
          need_break = true;
        } else {
          // Look at next following character, ignoring entity as Common.
          // At the end of the input there is no following character; treat
          // that as Common instead of reading beyond the text.
          int sc2 = ULScript_Common;
          if ((take + tlen) < byte_length_) {
            sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
          }
          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
            need_break = true;
          }
        }
      }
      if (need_break) {break;}  // Non-letter or letter in wrong script

      take += tlen;                   // Advance
      put += plen;                    // Advance
      if (put >= getone::kMaxScriptBytes) {
        // Buffer is full
        span->truncated = true;
        break;
      }
    }     // End while letters

    // Do run of non-letters (tag | &NL | NL)*
    while (take < byte_length_) {
      // Do fast scan to next interesting byte
      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);

      // Check for no more letters/specials
      if (take >= byte_length_) {
        take = byte_length_;
        break;
      }

      // We are at a letter, nonletter, tag, or entity
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if (next_byte_[take] == '<') {
          // Beginning of tag; skip to end and go around again
          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
          sc = 0;
        } else if (next_byte_[take] == '>') {
          // Unexpected end of tag; skip it and go around again
          tlen = 1;         // Over the >
          sc = 0;
        } else if (next_byte_[take] == '&') {
          // Expand entity, no advance: the output position is only used as
          // scratch space here, put is not moved
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        // Update 1..4
        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
      }
      if (sc != 0) {break;}           // Letter found
      take += tlen;                   // Advance
    }     // End while not-letters

    // One space stands in for the whole run of non-letters
    script_buffer_[put++] = ' ';

    // We are at a letter again (or eos), after letter* not-letter*
    if (sc != spanscript) {break;}            // Letter in wrong script
    if (put >= getone::kMaxScriptBytes - 8) {
      // Buffer is almost full
      span->truncated = true;
      break;
    }
  }

  // Update input position
  next_byte_ += take;
  byte_length_ -= take;

  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
  //                          kMaxScriptBytes |   | put
  script_buffer_[put + 0] = ' ';
  script_buffer_[put + 1] = ' ';
  script_buffer_[put + 2] = ' ';
  script_buffer_[put + 3] = '\0';

  span->text_bytes = put;       // Does not include the last four chars above

  return true;
}

// Lowercase the text of a span whose script is Latin, Cyrillic or Greek.
// Spans in any other script are left untouched. When lowercasing happens,
// span->text is redirected to script_buffer_lower_ and span->text_bytes is
// recomputed from the converted length.
void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
  // When CLD_WINDOWS is defined this function does nothing: per the original
  // note, text is lowercased beforehand on Windows.
#if !defined(CLD_WINDOWS)
  // Lowercasing is deferred to this point because doing it sooner might miss
  // an entity such as &Aacute;
  const bool wants_lowercase = (span->script == ULScript_Latin) ||
                               (span->script == ULScript_Cyrillic) ||
                               (span->script == ULScript_Greek);
  if (!wants_lowercase) {
    return;
  }

  // Full Unicode lowercase of the entire buffer; the four pad bytes that
  // follow the text are converted as well and then excluded from the count.
  int consumed, filled;
  UniLib::ToLower(span->text, span->text_bytes + 4,
                  script_buffer_lower_, getone::kMaxScriptLowerBuffer,
                  &consumed, &filled);
  span->text = script_buffer_lower_;
  span->text_bytes = filled - 4;
#endif
}

// Fetch the next same-script span with GetOneScriptSpan, then pass it through
// LowerScriptSpan. The lowercasing step runs whether or not a span was found;
// the return value is the one from GetOneScriptSpan.
bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
  const bool found = GetOneScriptSpan(span);
  LowerScriptSpan(span);
  return found;
}

// Gets lscript number for letters; always returns
//   0 (common script) for non-letters
int getone::GetUTF8LetterScriptNum(const char* src) {
  int srclen = cld_UniLib::OneCharLen(src);
  const uint8* usrc = reinterpret_cast<const uint8*>(src);
  return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
}
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)