// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
#include "encodings/compact_lang_det/letterscript_enum.h"
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
namespace getone {
static const int kMaxScriptBuffer = 4096;
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room
static const int kMaxAnswerBuffer = 256;
typedef enum UnicodeLScript ULScript;
typedef struct {
char* text; // Pointer to the span, somewhere
int text_bytes; // Number of bytes of text in the span
int offset; // Offset of start of span in original input buffer
ULScript script; // Script of all the letters in this span
Language lang; // Language identified for this span
bool truncated; // true if buffer filled up before a
// different script or EOF was found
} LangSpan;
static inline bool IsContinuationByte(char c) {
return static_cast<signed char>(c) < -64;
}
// Gets lscript number for letters; always returns
// 0 (common script) for non-letters
int GetUTF8LetterScriptNum(const char* src);
// Update src pointer to point to next quadgram, +2..+5
// Looks at src[0..4]
const char* AdvanceQuad(const char* src);
} // end namespace getone
class ScriptScanner {
public:
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
~ScriptScanner();
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
bool GetOneScriptSpan(getone::LangSpan* span);
// Force Latin and Cyrillic scripts to be lowercase
void LowerScriptSpan(getone::LangSpan* span);
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Force Latin and Cyrillic scripts to be lowercase
bool GetOneScriptSpanLower(getone::LangSpan* span);
private:
int SkipToFrontOfSpan(const char* src, int len, int* script);
const char* start_byte_;
const char* next_byte_;
const char* next_byte_limit_;
int byte_length_;
bool is_plain_text_;
char* script_buffer_; // Holds text with expanded entities
char* script_buffer_lower_; // Holds lowercased text
};
class LangScanner {
public:
LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
int maxlangs, int minlangspan);
~LangScanner();
int script() {return script_;}
// Use new text
// Keep smoothing state if same script, otherwise reinit smoothing
void NewText(getone::LangSpan* spn);
bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
// The real ones
bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
getone::LangSpan* span);
bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
getone::LangSpan* span);
// Increases language bias by delta
void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
Language key, int delta);
// For debugging output
int next_answer_;
char answer_buffer_[getone::kMaxAnswerBuffer];
char answer_buffer2_[getone::kMaxAnswerBuffer];
char answer_buffer3_[getone::kMaxAnswerBuffer];
char answer_buffer4_[getone::kMaxAnswerBuffer];
private:
const char* start_byte_;
const char* next_byte_limit_;
const char* next_byte_;
const char* onelangspan_begin_;
int byte_length_;
int script_;
Language spanlang_;
int smoothwidth_;
int smoothwidth_2_;
int smoothcandidates_;
int maxlangs_;
int minlangspan_;
int rb_size_;
int next_rb_;
int rb_mask_;
uint32* rb_;
int* offset_rb_;
};
#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_