// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file extends lang_enc.cc with additional languages and extended routines
// It is current with Unicode 5.1 (beta Jan 2008)
//
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "encodings/compact_lang_det/ext_lang_enc.h"
#include "encodings/compact_lang_det/win/cld_macros.h"
#include "encodings/compact_lang_det/win/cld_strtoint.h"
// Names for the extended languages, i.e. Language values in the range
// [EXT_LANGUAGE_BASE, EXT_NUM_LANGUAGES).
// These are also the C enum declared names.
// The table is indexed by (lang - EXT_LANGUAGE_BASE); see ExtLanguageName()
// and ExtLanguageDeclaredName(). kExtLanguageCode[] is indexed the same way,
// so the two tables must keep the same entry order.
static const char* const kExtLanguageName[] = {
"X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
// Pseudo-languages for Unicode scripts that express a single language
"X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC",
"X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE",
"X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT",
"X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH",
"X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM",
"X_PHOENICIAN", "X_PHAGS_PA", "X_NKO",
// Unicode 5.1
// NOTE(review): "X_SUDANESE" pairs with code "xx-Sund" in kExtLanguageCode[];
// the spelling matches the enum constant used elsewhere in this file, so it
// must not be "corrected" here alone.
"X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA",
"X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN",
"X_CHAM",
};
// These are the C enum declared names, for programs creating C code.
// The table is indexed directly by the Language value for
// 0 <= lang < NUM_LANGUAGES (see ExtLanguageDeclaredName()); the numeric
// comment after each entry is that index.
static const char* const kExtLangDeclaredName[] = {
"ENGLISH", /* 0 */
"DANISH", /* 1 */
"DUTCH", /* 2 */
"FINNISH", /* 3 */
"FRENCH", /* 4 */
"GERMAN", /* 5 */
"HEBREW", /* 6 */
"ITALIAN", /* 7 */
"JAPANESE", /* 8 */
"KOREAN", /* 9 */
"NORWEGIAN", /* 10 */
"POLISH", /* 11 */
"PORTUGUESE", /* 12 */
"RUSSIAN", /* 13 */
"SPANISH", /* 14 */
"SWEDISH", /* 15 */
"CHINESE", /* 16 */
"CZECH", /* 17 */
"GREEK", /* 18 */
"ICELANDIC", /* 19 */
"LATVIAN", /* 20 */
"LITHUANIAN", /* 21 */
"ROMANIAN", /* 22 */
"HUNGARIAN", /* 23 */
"ESTONIAN", /* 24 */
"TG_UNKNOWN_LANGUAGE", /* 25 */
"UNKNOWN_LANGUAGE", /* 26 */
"BULGARIAN", /* 27 */
"CROATIAN", /* 28 */
"SERBIAN", /* 29 */
"IRISH", /* 30 */
"GALICIAN", /* 31 */
"TAGALOG", /* 32 */
"TURKISH", /* 33 */
"UKRAINIAN", /* 34 */
"HINDI", /* 35 */
"MACEDONIAN", /* 36 */
"BENGALI", /* 37 */
"INDONESIAN", /* 38 */
"LATIN", /* 39 */
"MALAY", /* 40 */
"MALAYALAM", /* 41 */
"WELSH", /* 42 */
"NEPALI", /* 43 */
"TELUGU", /* 44 */
"ALBANIAN", /* 45 */
"TAMIL", /* 46 */
"BELARUSIAN", /* 47 */
"JAVANESE", /* 48 */
"OCCITAN", /* 49 */
"URDU", /* 50 */
"BIHARI", /* 51 */
"GUJARATI", /* 52 */
"THAI", /* 53 */
"ARABIC", /* 54 */
"CATALAN", /* 55 */
"ESPERANTO", /* 56 */
"BASQUE", /* 57 */
"INTERLINGUA", /* 58 */
"KANNADA", /* 59 */
"PUNJABI", /* 60 */
"SCOTS_GAELIC", /* 61 */
"SWAHILI", /* 62 */
"SLOVENIAN", /* 63 */
"MARATHI", /* 64 */
"MALTESE", /* 65 */
"VIETNAMESE", /* 66 */
"FRISIAN", /* 67 */
"SLOVAK", /* 68 */
"CHINESE_T", /* 69 */
"FAROESE", /* 70 */
"SUNDANESE", /* 71 */
"UZBEK", /* 72 */
"AMHARIC", /* 73 */
"AZERBAIJANI", /* 74 */
"GEORGIAN", /* 75 */
"TIGRINYA", /* 76 */
"PERSIAN", /* 77 */
"BOSNIAN", /* 78 */
"SINHALESE", /* 79 */
"NORWEGIAN_N", /* 80 */
"PORTUGUESE_P", /* 81 */
"PORTUGUESE_B", /* 82 */
"XHOSA", /* 83 */
"ZULU", /* 84 */
"GUARANI", /* 85 */
"SESOTHO", /* 86 */
"TURKMEN", /* 87 */
"KYRGYZ", /* 88 */
"BRETON", /* 89 */
"TWI", /* 90 */
"YIDDISH", /* 91 */
"SERBO_CROATIAN", /* 92 */
"SOMALI", /* 93 */
"UIGHUR", /* 94 */
"KURDISH", /* 95 */
"MONGOLIAN", /* 96 */
"ARMENIAN", /* 97 */
"LAOTHIAN", /* 98 */
"SINDHI", /* 99 */
"RHAETO_ROMANCE", /* 100 */
"AFRIKAANS", /* 101 */
"LUXEMBOURGISH", /* 102 */
"BURMESE", /* 103 */
"KHMER", /* 104 */
"TIBETAN", /* 105 */
"DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives
"CHEROKEE", /* 107 */
"SYRIAC", /* 108 */
"LIMBU", /* 109 */
"ORIYA", /* 110 */
"ASSAMESE", /* 111 */
"CORSICAN", /* 112 */
"INTERLINGUE", /* 113 */
"KAZAKH", /* 114 */
"LINGALA", /* 115 */
"MOLDAVIAN", /* 116 */
"PASHTO", /* 117 */
"QUECHUA", /* 118 */
"SHONA", /* 119 */
"TAJIK", /* 120 */
"TATAR", /* 121 */
"TONGA", /* 122 */
"YORUBA", /* 123 */
"CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */
"CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */
"CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */
"CREOLES_AND_PIDGINS_OTHER", /* 127 */
"MAORI", /* 128 */
"WOLOF", /* 129 */
"ABKHAZIAN", /* 130 */
"AFAR", /* 131 */
"AYMARA", /* 132 */
"BASHKIR", /* 133 */
"BISLAMA", /* 134 */
"DZONGKHA", /* 135 */
"FIJIAN", /* 136 */
"GREENLANDIC", /* 137 */
"HAUSA", /* 138 */
"HAITIAN_CREOLE", /* 139 */
"INUPIAK", /* 140 */
"INUKTITUT", /* 141 */
"KASHMIRI", /* 142 */
"KINYARWANDA", /* 143 */
"MALAGASY", /* 144 */
"NAURU", /* 145 */
"OROMO", /* 146 */
"RUNDI", /* 147 */
"SAMOAN", /* 148 */
"SANGO", /* 149 */
"SANSKRIT", /* 150 */
"SISWANT", /* 151 */
"TSONGA", /* 152 */
"TSWANA", /* 153 */
"VOLAPUK", /* 154 */
"ZHUANG", /* 155 */
"KHASI", /* 156 */
"SCOTS", /* 157 */
"GANDA", /* 158 */
"MANX", /* 159 */
"MONTENEGRIN", /* 160 */
// Add new language declared names just before here
};
// Compile-time check that the table above has exactly one entry per
// Language value below NUM_LANGUAGES.
COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
kExtLangDeclaredName_has_incorrect_length);
// Language codes for the extended languages, i.e. Language values in the
// range [EXT_LANGUAGE_BASE, EXT_NUM_LANGUAGES).
// I made all these up, except Klingon from ISO-639-2 (dsites)
// NOTE: zza is a standard name
// The table is indexed by (lang - EXT_LANGUAGE_BASE); see ExtLanguageCode().
// kExtLanguageName[] is indexed the same way, so the two tables must keep
// the same entry order.
static const char* const kExtLanguageCode[] = {
// "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
// All Latin script
"zzb", "zzp", "zzh", "tlh", "zze",
// Pseudo-languages for Unicode scripts that express a single language.
// Each code is "xx-" followed by a four-letter script tag.
"xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth",
"xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale",
"xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt",
"xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng",
"xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux",
"xx-Phnx", "xx-Phag", "xx-Nkoo",
// Unicode 5.1
"xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur",
"xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi",
"xx-Cham",
};
// Maps a Language to the display name emitted by the lang/enc identifier,
// e.g. "Korean".
//   lang < 0                      -> "" (no-text-at-all result from a Tote)
//   TG_UNKNOWN_LANGUAGE           -> "Ignore"; this is the placeholder
//                                    "ignore me" language used to subtract
//                                    out HTML, link farms, DNA strings, and a
//                                    little English porn
//   other base languages          -> LanguageName(lang)
//   extended (X_*) languages      -> entry from kExtLanguageName[]
//   anything else                 -> invalid_language_name()
const char* ExtLanguageName(const Language lang) {
  // Negative values carry no language at all.
  if (lang < 0) {
    return "";
  }

  // CompactLanguageDetect extension; must be tested before the base range
  // so that it wins over LanguageName().
  if (lang == TG_UNKNOWN_LANGUAGE) {
    return "Ignore";
  }

  // lang is known to be non-negative here, so only the upper bound matters.
  if (lang < NUM_LANGUAGES) {
    return LanguageName(lang);
  }

  const bool is_extended =
      (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);
  if (!is_extended) {
    return invalid_language_name();
  }
  return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
}
// Maps a Language to the spelling of its enum constant, for programs that
// generate C declarations, e.g. "KOREAN".
// Base languages come from kExtLangDeclaredName[], extended (X_*) languages
// from kExtLanguageName[]; any other value yields "UNKNOWN_LANGUAGE".
const char* ExtLanguageDeclaredName(const Language lang) {
  const char* declared = "UNKNOWN_LANGUAGE";

  if ((lang >= 0) && (lang < NUM_LANGUAGES)) {
    // Base table is indexed by the Language value itself.
    declared = kExtLangDeclaredName[lang];
  } else if ((lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES)) {
    // Extended table is indexed relative to EXT_LANGUAGE_BASE.
    declared = kExtLanguageName[lang - EXT_LANGUAGE_BASE];
  }

  return declared;
}
// Maps a Language to its language code, e.g. "ko".
//   TG_UNKNOWN_LANGUAGE       -> "xxx" (hack for the ignore/porn
//                                pseudo-language)
//   other base languages      -> LanguageCode(lang)
//   extended (X_*) languages  -> entry from kExtLanguageCode[]
//   anything else             -> "??"
const char* ExtLanguageCode(const Language lang) {
  // Checked first so that it overrides the base-language lookup.
  if (lang == TG_UNKNOWN_LANGUAGE) {
    return "xxx";
  }

  const bool is_base = (lang >= 0) && (lang < NUM_LANGUAGES);
  if (is_base) {
    return LanguageCode(lang);
  }

  const bool is_extended =
      (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);
  if (is_extended) {
    return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
  }

  return "??";
}
// Convert a language code or tag such as "en-Latn-GB" to a Language (ENGLISH).
//
// src must be a non-NULL, NUL-terminated string.
// A string consisting only of decimal digits (this includes the empty string)
// is parsed with strto32() and the number is cast directly to Language.
//
// Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
// Consider for later: NORWEGIAN, NORWEGIAN_N
// Consider for later: SCOTS, SCOTS_GAELIC
// Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
//
// When no special case matches, the result is whatever the last
// LanguageFromCode() call stored, or UNKNOWN_LANGUAGE if it was never called.
Language GetLanguageFromNumberOrName(const char* src) {
  const size_t len = strlen(src);
  if (strspn(src, "0123456789") == len) {
    // All digits
    return static_cast<Language>(strto32(src, NULL, 10));
  }
  Language retlang = UNKNOWN_LANGUAGE;

  // Merge sets of languages pt-xx en-xx fr-xx, NOT bs/hr/sr.
  // (Formerly conditional on FLAGS_mergepairs; always enabled.)
  // These are prefix tests. strncmp is used rather than memcmp because src
  // may be shorter than the prefix, and strncmp stops at the terminator
  // instead of reading past the end of the string.
  if (strncmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
  if (strncmp(src, "en-", 3) == 0) {return ENGLISH;}
  if (strncmp(src, "fr-", 3) == 0) {return FRENCH;}
  // Use NormalizeLanguage instead
  if (strncmp(src, "bs-", 3) == 0) {return CROATIAN;}
  if (strncmp(src, "hr-", 3) == 0) {return CROATIAN;}
  if (strncmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
  if (strncmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
  if (strncmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
  if (strncmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}

  // Extensions (prefix match on the first three characters)
  if (len >= 3) {
    // Standin for ignore/porn "language"
    if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
    if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
    if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
    if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
    if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
    if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
  }

  // We have a name like en-Latn-GB or pt-BR
  // First, get rid of some special cases
  if (len <= 3) {
    // Bare two- or three-letter code. The zh/pt/fr prefix tests further down
    // still take precedence over this result.
    LanguageFromCode(src, &retlang);
  } else if ((len == 7) && (memcmp(src, "xx-", 3) == 0)) {
    // More Extensions: "xx-" plus a four-letter script tag.
    // len == 7, so every 7-byte comparison below stays inside src.
    if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
    if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
    if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
    if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
    if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
    if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
    if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
    if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
    if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
    if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
    if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
    if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
    if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
    if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
    if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
    if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
    if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
    if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
    if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
    if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
    if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
    if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
    if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
    if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
    if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
    if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
    if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
    if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
    // Unicode 5.1
    if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
    if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
    if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
    if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
    if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
    if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
    if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
    if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
    if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
    if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
    if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
  }

  // Some other weird ones
  // Could be Latn or Limb; all our current training data is Latn
  if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
  if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}

  // Multi-country languages. These are two-character prefix tests, so any
  // string that starts with these letters is folded into the language.
  if (strncmp(src, "zh", 2) == 0) {
    // The prefix matched, so len >= 2 and the last two characters exist.
    const char* tail = &src[len - 2];
    if (memcmp(tail, "TW", 2) == 0) {return CHINESE_T;}
    if (memcmp(tail, "HK", 2) == 0) {return CHINESE_T;}
    return CHINESE;
  }
  // All Portuguese variants (including -BR) map to PORTUGUESE.
  if (strncmp(src, "pt", 2) == 0) {return PORTUGUESE;}
  // All French variants (including -CA) map to FRENCH.
  if (strncmp(src, "fr", 2) == 0) {return FRENCH;}

  // None of the special cases matched: look up the primary subtag, which is
  // the two or three characters in front of the first '-'.
  // The length guards keep src[2] and src[3] from being read beyond the
  // terminating NUL of one- and two-character inputs.
  if ((len > 2) && (src[2] == '-')) {
    char temp[4];
    memcpy(temp, src, 2);
    temp[2] = '\0';
    LanguageFromCode(temp, &retlang);
  }
  if ((len > 3) && (src[3] == '-')) {
    char temp[4];
    memcpy(temp, src, 3);
    temp[3] = '\0';
    LanguageFromCode(temp, &retlang);
  }
  return retlang;
}
// One row of the script lookup table: a script name string paired with the
// UnicodeLScript value it stands for.
typedef struct {
const char* name;        // script tag text, compared with strcmp() by the lookup
UnicodeLScript lscript;  // script value returned when name matches
} NameScriptPair;
// Script name -> UnicodeLScript table.
// In alphabetic order for binary search: GetLScriptFromNumberOrName()
// compares names with strcmp(), so entries must stay sorted in strcmp()
// order or lookups will silently miss.
static const NameScriptPair kNameScriptPair[] = {
// Entries tagged "Unicode 5.1" below are the scripts added for that version.
{"Arab", ULScript_Arabic},
{"Armn", ULScript_Armenian},
{"Bali", ULScript_Balinese},
{"Beng", ULScript_Bengali},
{"Bugi", ULScript_Buginese},
{"Buhd", ULScript_Buhid},
{"Cans", ULScript_Canadian_Aboriginal},
{"Cari", ULScript_Carian}, // Unicode 5.1
{"Cham", ULScript_Cham}, // Unicode 5.1
{"Cher", ULScript_Cherokee},
{"Copt", ULScript_Coptic},
{"Cprt", ULScript_Cypriot},
{"Cyrl", ULScript_Cyrillic},
{"Deva", ULScript_Devanagari},
{"Dsrt", ULScript_Deseret},
{"Ethi", ULScript_Ethiopic},
{"Geor", ULScript_Georgian},
{"Glag", ULScript_Glagolitic},
{"Goth", ULScript_Gothic},
{"Grek", ULScript_Greek},
{"Gujr", ULScript_Gujarati},
{"Guru", ULScript_Gurmukhi},
{"Hani", ULScript_HanCJK},
{"Hano", ULScript_Hanunoo},
{"Hebr", ULScript_Hebrew},
{"Ital", ULScript_Old_Italic},
{"Kali", ULScript_Kayah_Li}, // Unicode 5.1
{"Khar", ULScript_Kharoshthi},
{"Khmr", ULScript_Khmer},
{"Knda", ULScript_Kannada},
{"Laoo", ULScript_Lao},
{"Latn", ULScript_Latin},
{"Lepc", ULScript_Lepcha}, // Unicode 5.1
{"Limb", ULScript_Limbu},
{"Linb", ULScript_Linear_B},
{"Lyci", ULScript_Lycian}, // Unicode 5.1
{"Lydi", ULScript_Lydian}, // Unicode 5.1
{"Mlym", ULScript_Malayalam},
{"Mong", ULScript_Mongolian},
{"Mymr", ULScript_Myanmar},
{"Nkoo", ULScript_Nko},
{"Ogam", ULScript_Ogham},
{"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
{"Orya", ULScript_Oriya},
{"Osma", ULScript_Osmanya},
{"Phag", ULScript_Phags_Pa},
{"Phnx", ULScript_Phoenician},
{"Rjng", ULScript_Rejang}, // Unicode 5.1
{"Runr", ULScript_Runic},
{"Saur", ULScript_Saurashtra}, // Unicode 5.1
{"Shaw", ULScript_Shavian},
{"Sinh", ULScript_Sinhala},
{"Sund", ULScript_Sundanese}, // Unicode 5.1
{"Sylo", ULScript_Syloti_Nagri},
{"Syrc", ULScript_Syriac},
{"Tagb", ULScript_Tagbanwa},
{"Tale", ULScript_Tai_Le},
{"Talu", ULScript_New_Tai_Lue},
{"Taml", ULScript_Tamil},
{"Telu", ULScript_Telugu},
{"Tfng", ULScript_Tifinagh},
{"Tglg", ULScript_Tagalog},
{"Thaa", ULScript_Thaana},
{"Thai", ULScript_Thai},
{"Tibt", ULScript_Tibetan},
{"Ugar", ULScript_Ugaritic},
{"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
{"Xpeo", ULScript_Old_Persian},
{"Xsux", ULScript_Cuneiform},
{"Yiii", ULScript_Yi},
{"Zyyy", ULScript_Common},
{"Zzzz", ULScript_Inherited},
};
// Convert a language tag such as "en-Latn-GB" to its script, ULScript_Latin.
//
// src must be a non-NULL, NUL-terminated string.
// A string consisting only of decimal digits (this includes the empty string)
// is parsed with strto32() and the number is cast directly to UnicodeLScript.
// Otherwise a handful of fixed tags are handled first, and then the (up to)
// four characters following the first '-' are looked up in kNameScriptPair[].
// Returns ULScript_Latin when src has no '-' or the script tag is not found.
UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
  if (strspn(src, "0123456789") == strlen(src)) {
    // All digits
    return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
  }

  // Tags that carry a region instead of a script
  if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
  if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
  if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
  if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
  // Could be Latn or Limb; all our current training data is Latn
  if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}

  // Isolate just the script field
  const char* dash = strchr(src, '-');
  if (dash == NULL) {return ULScript_Latin;}
  char script[5];
  // Copy at most four characters after the '-'. strncmp-style bounded copy:
  // strncpy stops reading at the terminator, so a short field such as the
  // "GB" in "en-GB" is not read past its end, and it zero-pads the rest.
  strncpy(script, dash + 1, 4);
  script[4] = '\0';

  // Binary search over the sorted name table. The upper bound is the table's
  // own size so the search can neither run past its end nor skip its tail.
  int lo = 0;
  int hi = static_cast<int>(arraysize(kNameScriptPair));
  while (lo < hi) {
    const int mid = (lo + hi) >> 1;
    const int cmp = strcmp(script, kNameScriptPair[mid].name);
    if (cmp < 0) {
      hi = mid;
    } else if (cmp > 0) {
      lo = mid + 1;
    } else {
      return kNameScriptPair[mid].lscript;
    }
  }
  return ULScript_Latin;
}
// Folds closely related languages into a single representative:
//   BOSNIAN                     -> CROATIAN  (Latin script)
//   SERBO_CROATIAN              -> SERBIAN   (Cyrillic script)
//   PORTUGUESE_P, PORTUGUESE_B  -> PORTUGUESE
// Every other language is returned unchanged.
Language NormalizeLanguage(Language lang) {
  switch (lang) {
    case BOSNIAN:
      return CROATIAN;
    case SERBO_CROATIAN:
      return SERBIAN;
    case PORTUGUESE_P:
    case PORTUGUESE_B:
      return PORTUGUESE;
    default:
      return lang;
  }
}