// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
#define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
#include <string>
#include "encodings/compact_lang_det/ext_lang_enc.h"
#include "encodings/compact_lang_det/tote.h"
#include "encodings/compact_lang_det/win/cld_basictypes.h"
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
namespace cld {
// Hash bucket for four-way associative lookup with < 64K buckets
// 32 bytes per bucket, 8-byte entries
typedef struct {
uint32 key[4]; // hashed word to look up
uint32 value[4]; // packed three lang numbers and probability subscript
} SmallWordProbBucket4;
// Hash bucket for fouro-way associative lookup with >= 64K buckets
// 24 bytes per bucket, 6-byte entries
typedef struct {
uint16 key[4]; // Half of hashed word to look up; other
// half is used to pick the bucket
uint32 value[4]; // packed three lang numbers and probability subscript
} LargeQuadProbBucket4;
// Hash bucket for four-way associative lookup, indirect probabilities
// 16 bytes per bucket, 4-byte entries
typedef struct {
uint32 keyvalue[4]; // Upper part of word is hash, lower is indirect prob
} IndirectProbBucket4;
// This describes a complete CLD table, consisting of
// a main lookup table, an indirect language/probability table, and
// three constants.
// The main table key is a quadgram, bigram, or longword hash, with
// part of the key used to select a bucket modulo kCLDTableSize,
// and the rest matched against the key portion of four entries in a bucket,
// defined by kCLDTableKeyMask. The remaining bits of an entry, defined
// by ~kCLDTableKeyMask, are usually a subscript in the indirect table.
//
// By using part of the key to select a bucket, those key bits do not need
// to be stored in the main table entries, saving space (typically 2 bytes).
//
// By using an indirect table for lang/prob triples, only the subscript needs
// to be stored in the main table entires, saving space (typically 2 bytes).
//
// Each entry in the indirect table has three languages and three
// corresponding probabilities, packed into four bytes.
//
// The build date constant is included just for version tracking and is not
// otherwise used.
//
// Different-size tables can be linked in for different production
// environments. By going indirect through this struct, the runtime code is
// insensitive to the actual sizes.
//
// An empty placeholder table can be described by a table size of 1
// bucket, a keymask of 0xffffffff, a degenerate bucket of four no-match
// entries, and a degenerate indirect table of one no-languages entry.
//
//
struct CLDTableSummary {
const IndirectProbBucket4* kCLDTable;
// Each bucket has four entries, part
// key and part indirect subscript
const uint32* kCLDTableInd; // Each entry is three packed lang/prob
const int kCLDTableSize; // Bucket count
const int kCLDTableIndSize; // Entries count
const int kCLDTableKeyMask; // Mask hash key
const int kCLDTableBuildDate; // yyyymmdd
};
// Keeps per-character 0-12 language probabilities for CTJKVZ-- in that order.
// Chinese ChineseT Japanese Korean Vietnamese Zhuang
// (2 bytes unused, for alignment padding and future)
typedef struct {
uint8 probs[8];
} UnigramProbArray;
// Map 8-bit subscript to CTJKVZ probabilities
// Target runtime probabilities for CTJK + VZ
// Hand-generated to cover a reasonable range of choices
static const int kTargetCTJKVZProbsSize = 242;
static const UnigramProbArray kTargetCTJKVZProbs[kTargetCTJKVZProbsSize] = {
{{0,0,0,0,0,0,0,0}},
{{0,0,0,0,0,12,0,0}},
{{0,0,0,0,12,0,0,0}},
{{0,0,0,12,0,0,0,0}},
{{0,0,12,0,0,0,0,0}},
{{0,12,0,0,0,0,0,0}},
{{12,0,0,0,0,0,0,0}},
{{8,0,0,0,4,0,0,0}},
{{8,0,0,4,0,0,0,0}},
{{8,0,4,0,0,0,0,0}},
{{8,4,0,0,0,0,0,0}},
{{8,2,0,2,0,0,0,0}},
{{0,0,0,0,0,8,0,0}},
{{0,4,8,0,0,0,0,0}},
{{4,0,0,0,0,8,0,0}},
{{0,0,8,0,0,0,0,0}},
{{8,2,2,0,0,0,0,0}},
{{0,8,4,0,0,0,0,0}},
{{8,0,0,0,0,4,0,0}},
{{0,8,2,0,0,0,0,0}},
{{4,8,0,0,0,0,0,0}},
{{2,8,0,2,0,0,0,0}},
{{2,2,8,0,0,0,0,0}},
{{0,8,0,0,0,0,0,0}},
{{0,2,8,0,0,0,0,0}},
{{2,8,2,0,0,0,0,0}},
{{8,0,0,0,0,0,0,0}},
{{2,8,0,0,0,0,0,0}},
{{8,2,0,0,0,0,0,0}},
{{0,6,2,0,2,0,0,0}},
{{2,0,0,0,6,0,0,0}},
{{4,0,0,0,6,0,0,0}},
{{4,6,0,0,4,0,0,0}},
{{4,6,2,0,2,0,0,0}},
{{4,6,4,0,2,0,0,0}},
{{5,4,6,0,0,0,0,0}},
{{6,0,0,0,4,0,0,0}},
{{6,0,2,0,4,0,0,0}},
{{6,0,4,0,4,0,0,0}},
{{6,2,0,0,4,0,0,0}},
{{6,2,2,0,4,0,0,0}},
{{6,2,4,0,2,0,0,0}},
{{6,4,0,0,2,0,0,0}},
{{6,4,2,0,2,0,0,0}},
{{0,0,6,2,0,0,0,0}},
{{0,6,2,0,0,2,0,0}},
{{2,2,2,0,0,6,0,0}},
{{2,2,6,4,0,0,0,0}},
{{2,4,0,0,0,6,0,0}},
{{2,6,0,4,0,0,0,0}},
{{2,6,2,4,0,0,0,0}},
{{2,6,4,4,0,0,0,0}},
{{4,0,2,0,0,6,0,0}},
{{4,2,6,2,0,0,0,0}},
{{4,4,2,0,0,6,0,0}},
{{4,6,4,0,0,2,0,0}},
{{6,0,2,0,0,2,0,0}},
{{6,2,0,0,0,2,0,0}},
{{6,2,2,0,0,4,0,0}},
{{6,2,4,0,0,2,0,0}},
{{4,6,2,0,0,4,0,0}},
{{6,4,2,0,0,4,0,0}},
{{2,0,0,0,0,6,0,0}},
{{6,2,0,2,0,0,0,0}},
{{2,2,0,0,0,6,0,0}},
{{6,2,6,0,0,0,0,0}},
{{6,4,2,0,0,2,0,0}},
{{6,4,2,2,0,0,0,0}},
{{4,6,4,2,0,0,0,0}},
{{6,0,2,0,0,4,0,0}},
{{6,0,4,0,0,2,0,0}},
{{6,0,6,0,0,0,0,0}},
{{6,2,2,0,0,0,0,0}},
{{6,4,0,0,0,2,0,0}},
{{6,4,5,0,0,0,0,0}},
{{0,6,0,2,0,0,0,0}},
{{0,6,2,2,0,0,0,0}},
{{2,6,0,2,0,0,0,0}},
{{2,6,2,2,0,0,0,0}},
{{4,2,0,0,0,6,0,0}},
{{6,4,0,0,0,4,0,0}},
{{6,4,0,2,0,0,0,0}},
{{6,6,0,2,0,0,0,0}},
{{6,0,4,0,0,4,0,0}},
{{6,2,0,0,0,4,0,0}},
{{6,6,2,2,0,0,0,0}},
{{4,6,0,0,0,2,0,0}},
{{2,6,6,0,0,0,0,0}},
{{4,5,6,0,0,0,0,0}},
{{4,6,0,2,0,0,0,0}},
{{6,2,0,0,0,6,0,0}},
{{0,6,4,2,0,0,0,0}},
{{4,0,6,0,0,0,0,0}},
{{2,6,4,2,0,0,0,0}},
{{4,6,0,0,0,4,0,0}},
{{6,2,2,0,0,0,0,0}},
{{4,6,2,2,0,0,0,0}},
{{4,6,5,0,0,0,0,0}},
{{6,0,2,0,0,0,0,0}},
{{6,4,4,0,0,0,0,0}},
{{4,2,6,0,0,0,0,0}},
{{2,0,6,0,0,0,0,0}},
{{4,4,0,0,0,6,0,0}},
{{4,4,6,0,0,0,0,0}},
{{4,6,2,0,0,2,0,0}},
{{2,2,6,0,0,0,0,0}},
{{2,4,6,0,0,0,0,0}},
{{0,6,6,0,0,0,0,0}},
{{6,2,4,0,0,0,0,0}},
{{0,4,6,0,0,0,0,0}},
{{4,0,0,0,0,6,0,0}},
{{4,6,4,0,0,0,0,0}},
{{6,0,0,0,0,6,0,0}},
{{6,0,0,0,0,2,0,0}},
{{6,0,4,0,0,0,0,0}},
{{6,5,4,0,0,0,0,0}},
{{0,2,6,0,0,0,0,0}},
{{0,0,6,0,0,0,0,0}},
{{6,6,2,0,0,0,0,0}},
{{2,6,4,0,0,0,0,0}},
{{6,4,2,0,0,0,0,0}},
{{2,6,2,0,0,0,0,0}},
{{2,6,0,0,0,0,0,0}},
{{6,0,0,0,0,4,0,0}},
{{6,4,0,0,0,0,0,0}},
{{6,6,0,0,0,0,0,0}},
{{5,6,4,0,0,0,0,0}},
{{0,6,0,0,0,0,0,0}},
{{6,2,0,0,0,0,0,0}},
{{0,6,2,0,0,0,0,0}},
{{4,6,2,0,0,0,0,0}},
{{0,6,4,0,0,0,0,0}},
{{4,6,0,0,0,0,0,0}},
{{6,0,0,0,0,0,0,0}},
{{6,6,5,0,0,0,0,0}},
{{6,5,6,0,0,0,0,0}},
{{5,6,6,0,0,0,0,0}},
{{5,5,6,0,0,0,0,0}},
{{5,6,5,0,0,0,0,0}},
{{6,5,5,0,0,0,0,0}},
{{6,6,6,0,0,0,0,0}},
{{6,5,0,0,0,0,0,0}},
{{6,0,5,0,0,0,0,0}},
{{0,6,5,0,0,0,0,0}},
{{5,6,0,0,0,0,0,0}},
{{5,0,6,0,0,0,0,0}},
{{0,5,6,0,0,0,0,0}},
{{0,0,0,0,4,0,0,0}},
{{0,0,0,4,0,0,0,0}},
{{2,2,0,0,4,0,0,0}},
{{2,2,2,0,4,0,0,0}},
{{2,4,0,0,2,0,0,0}},
{{2,4,2,0,2,0,0,0}},
{{2,4,4,0,2,0,0,0}},
{{4,0,2,0,4,0,0,0}},
{{4,0,4,0,2,0,0,0}},
{{4,2,0,0,2,0,0,0}},
{{4,2,2,0,2,0,0,0}},
{{4,4,0,0,2,0,0,0}},
{{4,4,2,0,2,0,0,0}},
{{4,4,4,0,2,0,0,0}},
{{0,2,2,4,0,0,0,0}},
{{2,2,4,2,0,0,0,0}},
{{2,4,4,0,0,2,0,0}},
{{2,4,4,2,0,0,0,0}},
{{4,0,4,0,0,2,0,0}},
{{4,0,4,0,0,4,0,0}},
{{4,2,2,4,0,0,0,0}},
{{4,4,0,2,0,0,0,0}},
{{2,2,0,4,0,0,0,0}},
{{2,4,2,2,0,0,0,0}},
{{4,4,2,2,0,0,0,0}},
{{4,0,4,0,0,0,0,0}},
{{4,4,4,0,0,4,0,0}},
{{0,4,0,2,0,0,0,0}},
{{0,4,2,2,0,0,0,0}},
{{4,0,2,0,0,2,0,0}},
{{4,2,0,0,0,4,0,0}},
{{2,2,2,0,0,4,0,0}},
{{4,0,0,2,0,0,0,0}},
{{4,4,4,0,0,2,0,0}},
{{4,0,0,0,0,4,0,0}},
{{4,0,2,0,0,4,0,0}},
{{4,2,0,0,0,2,0,0}},
{{4,2,2,0,0,2,0,0}},
{{2,4,0,2,0,0,0,0}},
{{2,2,0,0,0,4,0,0}},
{{2,4,0,0,0,4,0,0}},
{{2,4,2,0,0,4,0,0}},
{{4,2,4,0,0,0,0,0}},
{{2,0,4,0,0,0,0,0}},
{{4,0,2,0,0,0,0,0}},
{{4,4,0,0,0,4,0,0}},
{{4,4,2,0,0,4,0,0}},
{{0,4,4,0,0,0,0,0}},
{{4,4,0,0,0,2,0,0}},
{{2,4,0,0,0,2,0,0}},
{{2,2,4,0,0,0,0,0}},
{{0,2,4,0,0,0,0,0}},
{{4,2,2,0,0,0,0,0}},
{{2,4,2,0,0,2,0,0}},
{{4,4,4,0,0,0,0,0}},
{{2,4,4,0,0,0,0,0}},
{{0,0,4,0,0,0,0,0}},
{{0,4,2,0,0,0,0,0}},
{{4,4,2,0,0,2,0,0}},
{{2,4,2,0,0,0,0,0}},
{{4,2,0,0,0,0,0,0}},
{{4,4,0,0,0,0,0,0}},
{{4,4,2,0,0,0,0,0}},
{{2,4,0,0,0,0,0,0}},
{{0,4,0,0,0,0,0,0}},
{{4,0,0,0,0,0,0,0}},
{{0,0,0,4,4,0,0,0}},
{{0,0,4,0,4,0,0,0}},
{{0,0,4,4,0,0,0,0}},
{{0,4,0,0,4,0,0,0}},
{{0,4,0,4,0,0,0,0}},
{{4,0,0,0,4,0,0,0}},
{{4,0,0,4,0,0,0,0}},
{{2,0,0,0,0,0,0,0}},
{{0,2,0,0,0,0,0,0}},
{{0,2,0,2,2,0,0,0}},
{{0,2,2,0,2,0,0,0}},
{{2,0,0,2,2,0,0,0}},
{{2,0,2,0,2,0,0,0}},
{{2,0,2,2,0,0,0,0}},
{{2,2,0,0,2,0,0,0}},
{{2,2,2,2,0,0,0,0}},
{{2,2,0,2,0,0,0,0}},
{{2,2,0,0,0,0,0,0}},
{{0,0,2,0,0,0,0,0}},
{{0,2,2,0,0,0,0,0}},
{{2,2,2,0,0,0,0,0}},
{{0,0,0,2,0,0,0,0}},
{{2,0,2,0,0,0,0,0}},
{{0,2,0,2,0,0,0,0}},
{{0,0,2,2,0,0,0,0}},
{{0,2,2,2,0,0,0,0}},
};
// 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
static const uint8 kSkipSpaceVowelContinue[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
static const uint8 kSkipSpaceContinue[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// If != UNKNOWN, use nilgrams to determine language of this script
static const Language kOnlyLanguagePerLScript[] = {
ENGLISH, // ULScript_Common, [no words should be in this script]
UNKNOWN_LANGUAGE, // ULScript_Latin,
//UNKNOWN_LANGUAGE, // ULScript_Greek, Jan 2009: change so we can score quads
GREEK, // ULScript_Greek, Mar 2009: change back; do gibberish separately
UNKNOWN_LANGUAGE, // ULScript_Cyrillic,
ARMENIAN, // ULScript_Armenian,
UNKNOWN_LANGUAGE, // ULScript_Hebrew,
UNKNOWN_LANGUAGE, // ULScript_Arabic,
SYRIAC, // ULScript_Syriac,
DHIVEHI, // ULScript_Thaana,
UNKNOWN_LANGUAGE, // ULScript_Devanagari,
UNKNOWN_LANGUAGE, // ULScript_Bengali,
PUNJABI, // ULScript_Gurmukhi,
GUJARATI, // ULScript_Gujarati,
ORIYA, // ULScript_Oriya,
TAMIL, // ULScript_Tamil,
TELUGU, // ULScript_Telugu,
KANNADA, // ULScript_Kannada,
MALAYALAM, // ULScript_Malayalam,
SINHALESE, // ULScript_Sinhala,
THAI, // ULScript_Thai,
LAOTHIAN, // ULScript_Lao,
UNKNOWN_LANGUAGE, // ULScript_Tibetan,
BURMESE, // ULScript_Myanmar,
GEORGIAN, // ULScript_Georgian,
UNKNOWN_LANGUAGE, // ULScript_HanCJK,
UNKNOWN_LANGUAGE, // ULScript_Ethiopic,
CHEROKEE, // ULScript_Cherokee,
INUKTITUT, // ULScript_Canadian_Aboriginal,
X_OGHAM, // ULScript_Ogham,
X_RUNIC, // ULScript_Runic,
KHMER, // ULScript_Khmer,
MONGOLIAN, // ULScript_Mongolian,
X_YI, // ULScript_Yi,
X_OLD_ITALIC, // ULScript_Old_Italic,
X_GOTHIC, // ULScript_Gothic,
X_DESERET, // ULScript_Deseret,
ENGLISH, // ULScript_Inherited, [no words should be in this script]
TAGALOG, // ULScript_Tagalog,
X_HANUNOO, // ULScript_Hanunoo,
X_BUHID, // ULScript_Buhid,
X_TAGBANWA, // ULScript_Tagbanwa,
LIMBU, // ULScript_Limbu,
X_TAI_LE, // ULScript_Tai_Le,
X_LINEAR_B, // ULScript_Linear_B,
X_UGARITIC, // ULScript_Ugaritic,
X_SHAVIAN, // ULScript_Shavian,
X_OSMANYA, // ULScript_Osmanya,
X_CYPRIOT, // ULScript_Cypriot,
X_BUGINESE, // ULScript_Buginese,
X_COPTIC, // ULScript_Coptic,
X_NEW_TAI_LUE, // ULScript_New_Tai_Lue,
X_GLAGOLITIC, // ULScript_Glagolitic,
X_TIFINAGH, // ULScript_Tifinagh,
X_SYLOTI_NAGRI, // ULScript_Syloti_Nagri,
X_OLD_PERSIAN, // ULScript_Old_Persian,
X_KHAROSHTHI, // ULScript_Kharoshthi,
X_BALINESE, // ULScript_Balinese,
X_CUNEIFORM, // ULScript_Cuneiform,
X_PHOENICIAN, // ULScript_Phoenician,
X_PHAGS_PA, // ULScript_Phags_Pa,
X_NKO, // ULScript_Nko,
// Unicode 5.1
X_SUDANESE, // ULScript_Sundanese,
X_LEPCHA, // ULScript_Lepcha,
X_OL_CHIKI, // ULScript_Ol_Chiki,
X_VAI, // ULScript_Vai,
X_SAURASHTRA, // ULScript_Saurashtra,
X_KAYAH_LI, // ULScript_Kayah_Li,
X_REJANG, // ULScript_Rejang,
X_LYCIAN, // ULScript_Lycian,
X_CARIAN, // ULScript_Carian,
X_LYDIAN, // ULScript_Lydian,
X_CHAM, // ULScript_Cham,
};
COMPILE_ASSERT(arraysize(kOnlyLanguagePerLScript) == ULScript_NUM_SCRIPTS,
kOnlyLanguagePerLScript_has_incorrect_length);
// This is, in a sense, the complement of the table above
// If != UNKNOWN, determines a default language of this script
static const Language kDefaultLanguagePerLScript[] = {
UNKNOWN_LANGUAGE, // ULScript_Common, [no words should be in this script]
ENGLISH, // ULScript_Latin,
UNKNOWN_LANGUAGE, // ULScript_Greek,
RUSSIAN, // ULScript_Cyrillic,
UNKNOWN_LANGUAGE, // ULScript_Armenian,
HEBREW, // ULScript_Hebrew,
ARABIC, // ULScript_Arabic,
UNKNOWN_LANGUAGE, // ULScript_Syriac,
UNKNOWN_LANGUAGE, // ULScript_Thaana,
HINDI, // ULScript_Devanagari,
BENGALI, // ULScript_Bengali,
UNKNOWN_LANGUAGE, // ULScript_Gurmukhi,
UNKNOWN_LANGUAGE, // ULScript_Gujarati,
UNKNOWN_LANGUAGE, // ULScript_Oriya,
UNKNOWN_LANGUAGE, // ULScript_Tamil,
UNKNOWN_LANGUAGE, // ULScript_Telugu,
UNKNOWN_LANGUAGE, // ULScript_Kannada,
UNKNOWN_LANGUAGE, // ULScript_Malayalam,
UNKNOWN_LANGUAGE, // ULScript_Sinhala,
UNKNOWN_LANGUAGE, // ULScript_Thai,
UNKNOWN_LANGUAGE, // ULScript_Lao,
TIBETAN, // ULScript_Tibetan,
UNKNOWN_LANGUAGE, // ULScript_Myanmar,
UNKNOWN_LANGUAGE, // ULScript_Georgian,
CHINESE, // ULScript_HanCJK,
AMHARIC, // ULScript_Ethiopic,
UNKNOWN_LANGUAGE, // ULScript_Cherokee,
UNKNOWN_LANGUAGE, // ULScript_Canadian_Aboriginal,
UNKNOWN_LANGUAGE, // ULScript_Ogham,
UNKNOWN_LANGUAGE, // ULScript_Runic,
UNKNOWN_LANGUAGE, // ULScript_Khmer,
UNKNOWN_LANGUAGE, // ULScript_Mongolian,
UNKNOWN_LANGUAGE, // ULScript_Yi,
UNKNOWN_LANGUAGE, // ULScript_Old_Italic,
UNKNOWN_LANGUAGE, // ULScript_Gothic,
UNKNOWN_LANGUAGE, // ULScript_Deseret,
UNKNOWN_LANGUAGE, // ULScript_Inherited, [no words should be in this script]
UNKNOWN_LANGUAGE, // ULScript_Tagalog,
UNKNOWN_LANGUAGE, // ULScript_Hanunoo,
UNKNOWN_LANGUAGE, // ULScript_Buhid,
UNKNOWN_LANGUAGE, // ULScript_Tagbanwa,
UNKNOWN_LANGUAGE, // ULScript_Limbu,
UNKNOWN_LANGUAGE, // ULScript_Tai_Le,
UNKNOWN_LANGUAGE, // ULScript_Linear_B,
UNKNOWN_LANGUAGE, // ULScript_Ugaritic,
UNKNOWN_LANGUAGE, // ULScript_Shavian,
UNKNOWN_LANGUAGE, // ULScript_Osmanya,
UNKNOWN_LANGUAGE, // ULScript_Cypriot,
UNKNOWN_LANGUAGE, // ULScript_Buginese,
UNKNOWN_LANGUAGE, // ULScript_Coptic,
UNKNOWN_LANGUAGE, // ULScript_New_Tai_Lue,
UNKNOWN_LANGUAGE, // ULScript_Glagolitic,
UNKNOWN_LANGUAGE, // ULScript_Tifinagh,
UNKNOWN_LANGUAGE, // ULScript_Syloti_Nagri,
UNKNOWN_LANGUAGE, // ULScript_Old_Persian,
UNKNOWN_LANGUAGE, // ULScript_Kharoshthi,
UNKNOWN_LANGUAGE, // ULScript_Balinese,
UNKNOWN_LANGUAGE, // ULScript_Cuneiform,
UNKNOWN_LANGUAGE, // ULScript_Phoenician,
UNKNOWN_LANGUAGE, // ULScript_Phags_Pa,
UNKNOWN_LANGUAGE, // ULScript_Nko,
// Unicode 5.1
UNKNOWN_LANGUAGE, // ULScript_Sundanese,
UNKNOWN_LANGUAGE, // ULScript_Lepcha,
UNKNOWN_LANGUAGE, // ULScript_Ol_Chiki,
UNKNOWN_LANGUAGE, // ULScript_Vai,
UNKNOWN_LANGUAGE, // ULScript_Saurashtra,
UNKNOWN_LANGUAGE, // ULScript_Kayah_Li,
UNKNOWN_LANGUAGE, // ULScript_Rejang,
UNKNOWN_LANGUAGE, // ULScript_Lycian,
UNKNOWN_LANGUAGE, // ULScript_Carian,
UNKNOWN_LANGUAGE, // ULScript_Lydian,
UNKNOWN_LANGUAGE, // ULScript_Cham,
};
COMPILE_ASSERT(arraysize(kDefaultLanguagePerLScript) == ULScript_NUM_SCRIPTS,
kDefaultLanguagePerLScript_has_incorrect_length);
// True for standalone languages (only lang in a script)
// Subscripted by packed language number
// If 1, we will use nilgrams to determine language
static const uint8 kIsStandaloneLang[EXT_NUM_LANGUAGES + 1] = {
0,
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,0, // GREEK
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
0,1,0,0,1, 0,1,0,0,0, 0,0,1,1,0, 0,0,0,0,1, // MALAYALAM..KANNADA
1,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,1, // PUNJABI..SINHALESE
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,1,1,0, // ARMENIAN..LAOTHIAN
0,0,0,0,1, 0,1,1,1,0, 1,0,0,0,0, 0,0,0,0,0, // KHMER..ORIYA
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
0,1,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, // INUKTITUT
0,0,0,0,0, // [160..164]
// Add new language standalone bit just before here
0,0,0,0,0, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
1,1,1,1,
};
// True for ULScript_HanCJK
// (Vietnamese and Zhuang also have Latin script quadgrams)
// Subscripted by packed language number
static const uint8 kIsUnigramLang[EXT_NUM_LANGUAGES + 1] = {
0,
0,0,0,0,0, 0,0,0,1,1, 0,0,0,0,0, 0,1,0,0,0, // JAPANESE KOREAN CHINESE
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
0,0,0,0,0, 0,1,0,0,1, 0,0,0,0,0, 0,0,0,0,0, // VIETNAMESE CHINESE_T
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,0, // ZHUANG
0,0,0,0,0, // [160..164]
// Add new language unigram bit just before here
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, //
0,0,0,0,
};
// True for ULScript_HanCJK
// Subscripted by lscript number
static const uint8 kScoreUniPerLScript[] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
};
COMPILE_ASSERT(arraysize(kScoreUniPerLScript) == ULScript_NUM_SCRIPTS,
kScoreUniPerLScript_has_incorrect_length);
// Defines Top40 packed languages
// Tier 0/1 Language enum list (16)
// ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
// DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
// PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
// ARABIC,
//
// Tier 2 Language enum list (22)
// SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
// HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
// VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
// TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
// UKRAINIAN, HINDI,
//
// use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
//
// Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
// NOTE: packed, i.e. Language enum + 1
static const uint8 kIsPackedTop40[EXT_NUM_LANGUAGES + 1] = {
0,
1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,0,
1,1,1,1,0, 1,0,1,0,0, 0,0,1,1,1, 1,0,0,1,0,
0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,1, 1,0,0,0,0,
0,0,0,1,0, 0,1,0,1,1, 0,0,0,0,0, 0,0,0,0,0,
0,0,0,0,0, 0,0,0,0,0, 0,0,1,0,0, 0,0,0,0,0,
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
0,0,0,0,0, // [160..164]
// Add new language top40 bit just before here
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
0,0,0,0,
};
// Table has 234 eight-byte entries. Each entry has a five-byte array and
// a three-byte array of log base 2 probabilities in the range 0..11.
// The intended use is to express five or three probabilities in a single-byte
// subscript, then decode via this table. These probabilities are
// intended to go with an array of five or three language numbers.
//
// The corresponding language numbers will have to be sorted by descending
// probability, then the actual probability subscript chosen to match the
// closest available entry in this table.
//
// Pattern of probability values:
// hi 3/4 1/2 1/4 lo hi mid lo
// where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 and
// mid is one of 3/4 1/2 or 1/4.
// There are three groups of 78 (=12*13/2) entries, with hi running 0..11 and
// lo running 0..hi. Only the first group is used for five-entry lookups.
// The mid value in the first group is 1/2, the second group 3/4, and the
// third group 1/4. For three-entry lookups, this allows the mid entry to be
// somewhat higher or lower than the midpoint, to allow a better match to the
// original probabilities.
static const int kLgProbV2TblSize = 234;
static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
1,1,1,1,1, 1,1,1, // [0]
2,2,2,1,1, 2,2,1, // [1]
2,2,2,2,2, 2,2,2,
3,3,2,2,1, 3,2,1, // [3]
3,3,3,2,2, 3,3,2,
3,3,3,3,3, 3,3,3,
4,3,3,2,1, 4,3,1, // [6]
4,4,3,3,2, 4,3,2,
4,4,4,3,3, 4,4,3,
4,4,4,4,4, 4,4,4,
5,4,3,2,1, 5,3,1, // [10]
5,4,4,3,2, 5,4,2,
5,5,4,4,3, 5,4,3,
5,5,5,4,4, 5,5,4,
5,5,5,5,5, 5,5,5,
6,5,4,2,1, 6,4,1, // [15]
6,5,4,3,2, 6,4,2,
6,5,5,4,3, 6,5,3,
6,6,5,5,4, 6,5,4,
6,6,6,5,5, 6,6,5,
6,6,6,6,6, 6,6,6,
7,6,4,3,1, 7,4,1, // [21]
7,6,5,3,2, 7,5,2,
7,6,5,4,3, 7,5,3,
7,6,6,5,4, 7,6,4,
7,7,6,6,5, 7,6,5,
7,7,7,6,6, 7,7,6,
7,7,7,7,7, 7,7,7,
8,6,5,3,1, 8,5,1, // [28]
8,7,5,4,2, 8,5,2,
8,7,6,4,3, 8,6,3,
8,7,6,5,4, 8,6,4,
8,7,7,6,5, 8,7,5,
8,8,7,7,6, 8,7,6,
8,8,8,7,7, 8,8,7,
8,8,8,8,8, 8,8,8,
9,7,5,3,1, 9,5,1, // [36]
9,7,6,4,2, 9,6,2,
9,8,6,5,3, 9,6,3,
9,8,7,5,4, 9,7,4,
9,8,7,6,5, 9,7,5,
9,8,8,7,6, 9,8,6,
9,9,8,8,7, 9,8,7,
9,9,9,8,8, 9,9,8,
9,9,9,9,9, 9,9,9,
10,8,6,3,1, 10,6,1, // [45]
10,8,6,4,2, 10,6,2,
10,8,7,5,3, 10,7,3,
10,9,7,6,4, 10,7,4,
10,9,8,6,5, 10,8,5,
10,9,8,7,6, 10,8,6,
10,9,9,8,7, 10,9,7,
10,10,9,9,8, 10,9,8,
10,10,10,9,9, 10,10,9,
10,10,10,10,10, 10,10,10,
11,9,6,4,1, 11,6,1, // [55]
11,9,7,4,2, 11,7,2,
11,9,7,5,3, 11,7,3,
11,9,8,6,4, 11,8,4,
11,10,8,7,5, 11,8,5,
11,10,9,7,6, 11,9,6,
11,10,9,8,7, 11,9,7,
11,10,10,9,8, 11,10,8,
11,11,10,10,9, 11,10,9,
11,11,11,10,10, 11,11,10,
11,11,11,11,11, 11,11,11,
12,9,7,4,1, 12,7,1, // [66]
12,10,7,5,2, 12,7,2,
12,10,8,5,3, 12,8,3,
12,10,8,6,4, 12,8,4,
12,10,9,7,5, 12,9,5,
12,11,9,8,6, 12,9,6,
12,11,10,8,7, 12,10,7,
12,11,10,9,8, 12,10,8,
12,11,11,10,9, 12,11,9,
12,12,11,11,10, 12,11,10,
12,12,12,11,11, 12,12,11,
12,12,12,12,12, 12,12,12,
1,1,1,1,1, 1,1,1,
2,2,2,1,1, 2,2,1,
2,2,2,2,2, 2,2,2,
3,3,2,2,1, 3,3,1,
3,3,3,2,2, 3,3,2,
3,3,3,3,3, 3,3,3,
4,3,3,2,1, 4,3,1,
4,4,3,3,2, 4,4,2,
4,4,4,3,3, 4,4,3,
4,4,4,4,4, 4,4,4,
5,4,3,2,1, 5,4,1,
5,4,4,3,2, 5,4,2,
5,5,4,4,3, 5,5,3,
5,5,5,4,4, 5,5,4,
5,5,5,5,5, 5,5,5,
6,5,4,2,1, 6,5,1,
6,5,4,3,2, 6,5,2,
6,5,5,4,3, 6,5,3,
6,6,5,5,4, 6,6,4,
6,6,6,5,5, 6,6,5,
6,6,6,6,6, 6,6,6,
7,6,4,3,1, 7,6,1,
7,6,5,3,2, 7,6,2,
7,6,5,4,3, 7,6,3,
7,6,6,5,4, 7,6,4,
7,7,6,6,5, 7,7,5,
7,7,7,6,6, 7,7,6,
7,7,7,7,7, 7,7,7,
8,6,5,3,1, 8,6,1,
8,7,5,4,2, 8,7,2,
8,7,6,4,3, 8,7,3,
8,7,6,5,4, 8,7,4,
8,7,7,6,5, 8,7,5,
8,8,7,7,6, 8,8,6,
8,8,8,7,7, 8,8,7,
8,8,8,8,8, 8,8,8,
9,7,5,3,1, 9,7,1,
9,7,6,4,2, 9,7,2,
9,8,6,5,3, 9,8,3,
9,8,7,5,4, 9,8,4,
9,8,7,6,5, 9,8,5,
9,8,8,7,6, 9,8,6,
9,9,8,8,7, 9,9,7,
9,9,9,8,8, 9,9,8,
9,9,9,9,9, 9,9,9,
10,8,6,3,1, 10,8,1,
10,8,6,4,2, 10,8,2,
10,8,7,5,3, 10,8,3,
10,9,7,6,4, 10,9,4,
10,9,8,6,5, 10,9,5,
10,9,8,7,6, 10,9,6,
10,9,9,8,7, 10,9,7,
10,10,9,9,8, 10,10,8,
10,10,10,9,9, 10,10,9,
10,10,10,10,10, 10,10,10,
11,9,6,4,1, 11,9,1,
11,9,7,4,2, 11,9,2,
11,9,7,5,3, 11,9,3,
11,9,8,6,4, 11,9,4,
11,10,8,7,5, 11,10,5,
11,10,9,7,6, 11,10,6,
11,10,9,8,7, 11,10,7,
11,10,10,9,8, 11,10,8,
11,11,10,10,9, 11,11,9,
11,11,11,10,10, 11,11,10,
11,11,11,11,11, 11,11,11,
12,9,7,4,1, 12,9,1,
12,10,7,5,2, 12,10,2,
12,10,8,5,3, 12,10,3,
12,10,8,6,4, 12,10,4,
12,10,9,7,5, 12,10,5,
12,11,9,8,6, 12,11,6,
12,11,10,8,7, 12,11,7,
12,11,10,9,8, 12,11,8,
12,11,11,10,9, 12,11,9,
12,12,11,11,10, 12,12,10,
12,12,12,11,11, 12,12,11,
12,12,12,12,12, 12,12,12,
1,1,1,1,1, 1,1,1,
2,2,2,1,1, 2,1,1,
2,2,2,2,2, 2,2,2,
3,3,2,2,1, 3,2,1,
3,3,3,2,2, 3,2,2,
3,3,3,3,3, 3,3,3,
4,3,3,2,1, 4,2,1,
4,4,3,3,2, 4,3,2,
4,4,4,3,3, 4,3,3,
4,4,4,4,4, 4,4,4,
5,4,3,2,1, 5,2,1,
5,4,4,3,2, 5,3,2,
5,5,4,4,3, 5,4,3,
5,5,5,4,4, 5,4,4,
5,5,5,5,5, 5,5,5,
6,5,4,2,1, 6,2,1,
6,5,4,3,2, 6,3,2,
6,5,5,4,3, 6,4,3,
6,6,5,5,4, 6,5,4,
6,6,6,5,5, 6,5,5,
6,6,6,6,6, 6,6,6,
7,6,4,3,1, 7,3,1,
7,6,5,3,2, 7,3,2,
7,6,5,4,3, 7,4,3,
7,6,6,5,4, 7,5,4,
7,7,6,6,5, 7,6,5,
7,7,7,6,6, 7,6,6,
7,7,7,7,7, 7,7,7,
8,6,5,3,1, 8,3,1,
8,7,5,4,2, 8,4,2,
8,7,6,4,3, 8,4,3,
8,7,6,5,4, 8,5,4,
8,7,7,6,5, 8,6,5,
8,8,7,7,6, 8,7,6,
8,8,8,7,7, 8,7,7,
8,8,8,8,8, 8,8,8,
9,7,5,3,1, 9,3,1,
9,7,6,4,2, 9,4,2,
9,8,6,5,3, 9,5,3,
9,8,7,5,4, 9,5,4,
9,8,7,6,5, 9,6,5,
9,8,8,7,6, 9,7,6,
9,9,8,8,7, 9,8,7,
9,9,9,8,8, 9,8,8,
9,9,9,9,9, 9,9,9,
10,8,6,3,1, 10,3,1,
10,8,6,4,2, 10,4,2,
10,8,7,5,3, 10,5,3,
10,9,7,6,4, 10,6,4,
10,9,8,6,5, 10,6,5,
10,9,8,7,6, 10,7,6,
10,9,9,8,7, 10,8,7,
10,10,9,9,8, 10,9,8,
10,10,10,9,9, 10,9,9,
10,10,10,10,10, 10,10,10,
11,9,6,4,1, 11,4,1,
11,9,7,4,2, 11,4,2,
11,9,7,5,3, 11,5,3,
11,9,8,6,4, 11,6,4,
11,10,8,7,5, 11,7,5,
11,10,9,7,6, 11,7,6,
11,10,9,8,7, 11,8,7,
11,10,10,9,8, 11,9,8,
11,11,10,10,9, 11,10,9,
11,11,11,10,10, 11,10,10,
11,11,11,11,11, 11,11,11,
12,9,7,4,1, 12,4,1,
12,10,7,5,2, 12,5,2,
12,10,8,5,3, 12,5,3,
12,10,8,6,4, 12,6,4,
12,10,9,7,5, 12,7,5,
12,11,9,8,6, 12,8,6,
12,11,10,8,7, 12,8,7,
12,11,10,9,8, 12,9,8,
12,11,11,10,9, 12,10,9,
12,12,11,11,10, 12,11,10,
12,12,12,11,11, 12,11,11,
12,12,12,12,12, 12,12,12,
};
// Backmap a single desired probability into an entry in kLgProbV2Tbl
static const uint8 kLgProbV2TblBackmap[13] = {
0,
0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
};
// Always advances one UTF-8 character
static const uint8 kAdvanceOneChar[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
};
// Does not advance past space or cr/lf/nul
static const uint8 kAdvanceOneCharButSpace[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
};
// Advances *only* on space or ASCII vowel (or illegal byte)
static const uint8 kAdvanceOneCharSpaceVowel[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// Advances *only* on space (or illegal byte)
static const uint8 kAdvanceOneCharSpace[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
//------------------------------------------------------------------------------
// General
//------------------------------------------------------------------------------
static inline int minint(int a, int b) {return (a < b) ? a: b;}
static inline int maxint(int a, int b) {return (a > b) ? a: b;}
// Here to make available for debugging
int ReliabilityDelta(int value1, int value2, int count);
int ReliabilityMainstream(int topscore, int len, int mean_score);
// Returns "0" for too small
inline const char* MyExtLanguageCode(Language lang) {
return ExtLanguageCode(lang);
}
// Map script into Latin, Cyrillic, Arabic, Other. Used in keeping track of
// amount of training data for language-script combinations
inline int LScript4(UnicodeLScript lscript) {
if (lscript == ULScript_Latin) {return 0;}
if (lscript == ULScript_Cyrillic) {return 1;}
if (lscript == ULScript_Arabic) {return 2;}
return 3;
}
// Routines to access 3 or 5 log probabilities in a single byte.
// Return address of 8-byte entry[i]
inline const uint8* LgProb2TblEntry(int i) {
return &kLgProbV2Tbl[i * 8];
}
// Return one of five probabilities in an entry
// CURRENTLY UNUSED
inline uint8 LgProb5(const uint8* entry, int j) {
return entry[j];
}
// Return one of three probabilities in an entry
inline uint8 LgProb3(const uint8* entry, int j) {
return entry[j + 5];
}
//------------------------------------------------------------------------------
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
//------------------------------------------------------------------------------
// Pick up 1..12 bytes and hash them via mask/shift/add. NO pre/post
// OVERSHOOTS up to 3 bytes
uint32 BiHashV25(const char* word_ptr, int bytecount);
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
uint32 QuadHashV25(const char* word_ptr, int bytecount);
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
// OVERSHOOTS up to 3 bytes
uint32 QuadHashV25Underscore(const char* word_ptr, int bytecount);
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// For runtime use of tables V3
uint64 OctaHash40(const char* word_ptr, int bytecount);
uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
// From 32-bit gram FP, return hash table subscript and remaining key
inline void QuadFPJustHash(uint32 quadhash,
uint32 keymask,
int bucketcount,
uint32* subscr, uint32* hashkey) {
*subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);
*hashkey = quadhash & keymask;
}
// Look up 32-bit gram FP in caller-passed table
// Typical size 256K entries (1.5MB)
// Two-byte hashkey
inline const uint32 QuadHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
uint32 quadhash) {
uint32 subscr, hashkey;
const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
uint32 keymask = gram_obj->kCLDTableKeyMask;
int bucketcount = gram_obj->kCLDTableSize;
QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
// Four-way associative, 4 compares
if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
return bucket_ptr->keyvalue[0];
}
if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
return bucket_ptr->keyvalue[1];
}
if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
return bucket_ptr->keyvalue[2];
}
if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
return bucket_ptr->keyvalue[3];
}
return 0;
}
// Map 40 bits to subscript, hashkey, expected 18-22 bit subscript (min 16)
// wwwwwwww xxxxxxxx xxxxxxxx yyyyyyyy yyyyyyyy
// + ........ ....wwww wwwwxxxx xxxxxxxx xxxxyyyy
// 00000000 00000000 00000011 11111111 11111111 (18-bit bucketcount-1)
//
// hashkey:
// wwwwxxxx xxxxxxxx xxxx.... ........ (20-bit keymask)
// 12-bit shift in subscript mixes in ~4 letters x 4 bits each
// From 40-bit gram FP, return hash table subscript and remaining key
inline void OctaFPJustHash(uint64 longwordhash,
uint32 keymask,
int bucketcount,
uint32* subscr, uint32* hashkey) {
uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1);
*subscr = temp;
temp = longwordhash >> 4;
*hashkey = temp & keymask;
}
// Look up 40-bit gram FP in caller-passed table
// Typical size 256K-4M entries (1-16MB)
// 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
// keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
inline const uint32 OctaHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
uint64 longwordhash) {
uint32 subscr, hashkey;
const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
uint32 keymask = gram_obj->kCLDTableKeyMask;
int bucketcount = gram_obj->kCLDTableSize;
OctaFPJustHash(longwordhash, keymask, bucketcount,
&subscr, &hashkey);
const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
// Four-way associative, 4 compares
if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
return bucket_ptr->keyvalue[0];
}
if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
return bucket_ptr->keyvalue[1];
}
if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
return bucket_ptr->keyvalue[2];
}
if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
return bucket_ptr->keyvalue[3];
}
return 0;
}
//------------------------------------------------------------------------------
// Scoring single groups of letters
//------------------------------------------------------------------------------
// UNIGRAM score one => tote
// Input: 1-byte entry of subscript into unigram probs, plus
// an accumulator tote.
// Output: running sums in tote updated
void ProcessProbV25UniTote(int propval, Tote* tote);
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
// Input: 4-byte entry of 3 language numbers and one probability subscript,
// plus an accumulator tote. (language 0 means unused entry)
// Output: running sums in tote updated
void ProcessProbV25Tote(uint32 probs, Tote* tote);
//------------------------------------------------------------------------------
// Routines to accumulate probabilities
//------------------------------------------------------------------------------
// Score up to n=gram_limit unigrams, returning number of bytes consumed
// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
int DoUniScoreV3(const UTF8PropObj* unigram_obj,
const char* isrc, int srclen, int advance_by,
int* tote_grams, int gram_limit, Tote* chunk_tote);
// Score all words in isrc, using languages that have bigrams (CJK)
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
// Return number of bigrams that hit in the hash table
int DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
const char* isrc, int srclen, Tote* chunk_tote);
// Score up to n=gram_limit quadgrams, returning number of bytes consumed
// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
int DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
const char* isrc, int srclen, int advance_by,
int* tote_grams, int gram_limit, Tote* chunk_tote);
// Score all octagrams (words) in isrc, using languages that have quadgrams
// Caller supplies table, such as &kLongWord8Table_obj
// Return number of words that hit in the hash table
int DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
const char* isrc, int srclen, Tote* chunk_tote);
//------------------------------------------------------------------------------
// Reliability calculations, for single language and between languages
//------------------------------------------------------------------------------
// Reliability = 0..100
static const int kMinReliable = 75;
// Calculate ratio of score per 1KB vs. expected score per 1KB
double GetNormalizedScore(Language lang, UnicodeLScript lscript,
int bytes, int score);
// Calculate reliablity of len bytes of script lscript with chunk_tote
int GetReliability(int len, UnicodeLScript lscript, const Tote* chunk_tote);
//------------------------------------------------------------------------------
// Miscellaneous
//------------------------------------------------------------------------------
// Make languages packed into uint32 values non-zero
// These routines later could remap so languages not in QuadHash tables are not
// represented, and so that any thrashing in accumulation is eliminated
uint8 inline PackLanguage(Language lang) {
return static_cast<uint8>(lang + 1);}
Language inline UnpackLanguage(int ilang) {
return static_cast<Language>(ilang - 1);}
// Useful single-byte tests
bool inline IsUTF8ContinueByte(char c) {
return static_cast<signed char>(c) < -64;}
bool inline IsUTF8HighByte(char c) {
return static_cast<signed char>(c) < 0;}
// Demote all languages except Top40 and plus_one
// Do this just before sorting
void DemoteNotTop40(Tote* chunk_tote, int packed_plus_one);
} // End namespace cld
#endif // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_