The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
// Copyright (c) 2011 Michael McCandless. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <Python.h>

#include "encodings/compact_lang_det/compact_lang_det.h"
#include "encodings/compact_lang_det/ext_lang_enc.h"
#include "base/string_util.h"
#include "cld_encodings.h"

static PyObject *CLDError;

// Resolves an encoding name to its Encoding value by scanning
// cld_encoding_info and comparing names with base::strcasecmp.
//
// On a match, stores the matching entry's encoding in *answer and returns
// true.  Otherwise stores UNKNOWN_ENCODING in *answer and returns false.
static bool EncodingFromName(const char *name, Encoding *answer) {
  // Advance until the first entry whose name compares equal, or until the
  // table is exhausted.
  int pos = 0;
  while (pos < NUM_ENCODINGS &&
         base::strcasecmp(name, cld_encoding_info[pos].name) != 0) {
    ++pos;
  }

  if (pos >= NUM_ENCODINGS) {
    // Ran off the end of the table: no such encoding.
    *answer = UNKNOWN_ENCODING;
    return false;
  }

  *answer = cld_encoding_info[pos].encoding;
  return true;
}

// Python entry point: cld.detect(utf8Bytes, isPlainText=0,
//                                includeExtendedLanguages=1,
//                                hintTopLevelDomain=None,
//                                hintLanguageCode=None,
//                                hintEncoding=None,
//                                pickSummaryLanguage=0,
//                                removeWeakMatches=1)
//
// Runs CompactLangDet::DetectLanguage over the given bytes and returns a
// 5-tuple:
//   (languageName, languageCode, isReliable, textBytesFound, details)
// where details is a list with up to three
//   (languageName, languageCode, percent, normalizedScore)
// tuples; the list stops at the first UNKNOWN_LANGUAGE slot.
//
// Returns NULL with a Python exception set when argument parsing fails, when
// a language/encoding hint is not recognized (CLDError), or when building the
// result objects fails.
static PyObject *
detect(PyObject *self, PyObject *args, PyObject *kwArgs) {
  char *bytes;
  // Paired with the "s#" format below; stays an int because this file does
  // not define PY_SSIZE_T_CLEAN before including Python.h.
  int numBytes;

  int isPlainText = 0;
  int pickSummaryLanguage = 0;
  int removeWeakMatches = 1;
  int includeExtendedLanguages = 1;

  // "id" boosts Indonesian;
  const char* hintTopLevelDomain = NULL;

  // ITALIAN boosts it
  const char* hintLanguageCode = NULL;

  // SJS boosts Japanese
  const char* hintEncoding = NULL;

  static const char *kwList[] = {"utf8Bytes",
                                 "isPlainText",
                                 "includeExtendedLanguages",
                                 "hintTopLevelDomain",
                                 "hintLanguageCode",
                                 "hintEncoding",
                                 "pickSummaryLanguage",
                                 "removeWeakMatches",
                                 NULL};

  if (!PyArg_ParseTupleAndKeywords(args, kwArgs, "s#|iizzzii",
                                   (char **) kwList,
                                   &bytes, &numBytes,
                                   &isPlainText,
                                   &includeExtendedLanguages,
                                   &hintTopLevelDomain,
                                   &hintLanguageCode,
                                   &hintEncoding,
                                   &pickSummaryLanguage,
                                   &removeWeakMatches)) {
    return NULL;
  }

  // Translate the optional language hint into its enum value.
  Language hintLanguageEnum;
  if (hintLanguageCode == NULL) {
    // no hint
    hintLanguageEnum = UNKNOWN_LANGUAGE;
  } else if (!LanguageFromCode(hintLanguageCode, &hintLanguageEnum)) {
    // TODO: maybe LookupError?
    PyErr_Format(CLDError, "Unrecognized language hint code (got '%s'); see cld.LANGUAGES for recognized language codes (note that currently external languages cannot be hinted)", hintLanguageCode);
    return NULL;
  }

  // Translate the optional encoding hint into its enum value.
  Encoding hintEncodingEnum;
  if (hintEncoding == NULL) {
    // no hint
    hintEncodingEnum = UNKNOWN_ENCODING;
  } else if (!EncodingFromName(hintEncoding, &hintEncodingEnum)) {
    PyErr_Format(CLDError, "Unrecognized encoding hint code (got '%s'); see cld.ENCODINGS for recognized encodings", hintEncoding);
    return NULL;
  }

  bool isReliable;
  Language language3[3];
  int percent3[3];
  double normalized_score3[3];
  int textBytesFound;
  Language sumLang;

  // No Python objects are touched during detection, so the GIL is released
  // for the duration of the call.
  Py_BEGIN_ALLOW_THREADS
  sumLang = CompactLangDet::DetectLanguage(0,
                                           bytes, numBytes,
                                           isPlainText != 0,
                                           includeExtendedLanguages != 0,
                                           pickSummaryLanguage != 0,
                                           removeWeakMatches != 0,
                                           hintTopLevelDomain,
                                           hintEncodingEnum,
                                           hintLanguageEnum,
                                           language3,
                                           percent3,
                                           normalized_score3,
                                           &textBytesFound,
                                           &isReliable);
  Py_END_ALLOW_THREADS

  PyObject *details = PyList_New(0);
  if (details == NULL) {
    return NULL;
  }

  for (int idx = 0; idx < 3; idx++) {
    Language lang = language3[idx];
    if (lang == UNKNOWN_LANGUAGE) {
      break;
    }

    PyObject *oneDetail = Py_BuildValue("(ssif)",
                                        ExtLanguageName(lang),
                                        ExtLanguageCode(lang),
                                        percent3[idx],
                                        normalized_score3[idx]);
    if (oneDetail == NULL) {
      Py_DECREF(details);
      return NULL;
    }

    // PyList_Append takes its own reference, so ours is dropped regardless
    // of whether the append succeeded.
    const int appendStatus = PyList_Append(details, oneDetail);
    Py_DECREF(oneDetail);
    if (appendStatus < 0) {
      Py_DECREF(details);
      return NULL;
    }
  }

  // "O" adds a reference to both the bool singleton and details, so our own
  // reference to details is released afterwards.  result may be NULL, in
  // which case the exception set by Py_BuildValue propagates to the caller.
  PyObject *result = Py_BuildValue("(ssOiO)",
                                   ExtLanguageName(sumLang),
                                   ExtLanguageCode(sumLang),
                                   isReliable ? Py_True : Py_False,
                                   textBytesFound,
                                   details);
  Py_DECREF(details);
  return result;
}

// Method table handed to Py_InitModule: the functions exported by the
// "cld" module.
static PyMethodDef CLDMethods[] = {
  {
    "detect",                        // name visible from Python
    (PyCFunction) detect,            // implementation (accepts keywords)
    METH_VARARGS | METH_KEYWORDS,    // calling convention
    "Detect language from a UTF8 string."
  },
  // Sentinel entry terminating the table.
  {NULL, NULL, 0, NULL}
};

PyMODINIT_FUNC
initcld() {
  PyObject* m = Py_InitModule("cld", CLDMethods);
  if (m == NULL) {
    return;
  }

  // Set module-global ENCODINGS tuple:
  PyObject* pyEncs = PyTuple_New(NUM_ENCODINGS);
  for(int encIDX=0;encIDX<NUM_ENCODINGS;encIDX++) {
    PyTuple_SET_ITEM(pyEncs, encIDX, PyString_FromString(cld_encoding_info[encIDX].name));
  }
  // Steals ref:
  PyModule_AddObject(m, "ENCODINGS", pyEncs);

  // Set module-global LANGUAGES tuple:
  PyObject* pyLangs = PyTuple_New(NUM_LANGUAGES);
  for(int langIDX=0;langIDX<NUM_LANGUAGES;langIDX++) {
    PyObject* pyLang = Py_BuildValue("(zz)",
                                     LanguageName((Language) langIDX),
                                     LanguageCode((Language) langIDX));
    PyTuple_SET_ITEM(pyLangs, langIDX, pyLang);
  }
  // Steals ref:
  PyModule_AddObject(m, "LANGUAGES", pyLangs);

  // Set module-global EXTERNAL_LANGUAGES tuple:
  const int numExtLangs = EXT_NUM_LANGUAGES - EXT_LANGUAGE_BASE; // see ext_lang_enc.h
  PyObject* pyExtLangs = PyTuple_New(numExtLangs);
  for(int langIDX=EXT_LANGUAGE_BASE;langIDX<EXT_NUM_LANGUAGES;langIDX++) {
    PyObject* pyLang = Py_BuildValue("(zz)",
                                     ExtLanguageName((Language) langIDX),
                                     ExtLanguageCode((Language) langIDX));
    PyTuple_SET_ITEM(pyExtLangs, langIDX - EXT_LANGUAGE_BASE, pyLang);
  }
  // Steals ref:
  PyModule_AddObject(m, "EXTERNAL_LANGUAGES", pyExtLangs);

  // Set module-global DETECTED_LANGUAGES tuple:
  // MKM: NOTE I reverse engineered this list from the unit
  // test!!  It has all languages ever detected by the test

  PyObject* pyDetLangs = PyTuple_New(75);
  PyTuple_SET_ITEM(pyDetLangs, 0, PyString_FromString("AFRIKAANS"));
  PyTuple_SET_ITEM(pyDetLangs, 1, PyString_FromString("ALBANIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 2, PyString_FromString("AMHARIC"));
  PyTuple_SET_ITEM(pyDetLangs, 3, PyString_FromString("ARABIC"));
  PyTuple_SET_ITEM(pyDetLangs, 4, PyString_FromString("ARMENIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 5, PyString_FromString("AZERBAIJANI"));
  PyTuple_SET_ITEM(pyDetLangs, 6, PyString_FromString("BASQUE"));
  PyTuple_SET_ITEM(pyDetLangs, 7, PyString_FromString("BELARUSIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 8, PyString_FromString("BENGALI"));
  PyTuple_SET_ITEM(pyDetLangs, 9, PyString_FromString("BULGARIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 10, PyString_FromString("BURMESE"));
  PyTuple_SET_ITEM(pyDetLangs, 11, PyString_FromString("CATALAN"));
  PyTuple_SET_ITEM(pyDetLangs, 12, PyString_FromString("CHEROKEE"));
  PyTuple_SET_ITEM(pyDetLangs, 13, PyString_FromString("CROATIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 14, PyString_FromString("CZECH"));
  PyTuple_SET_ITEM(pyDetLangs, 15, PyString_FromString("Chinese"));
  PyTuple_SET_ITEM(pyDetLangs, 16, PyString_FromString("ChineseT"));
  PyTuple_SET_ITEM(pyDetLangs, 17, PyString_FromString("DANISH"));
  PyTuple_SET_ITEM(pyDetLangs, 18, PyString_FromString("DHIVEHI"));
  PyTuple_SET_ITEM(pyDetLangs, 19, PyString_FromString("DUTCH"));
  PyTuple_SET_ITEM(pyDetLangs, 20, PyString_FromString("ENGLISH"));
  PyTuple_SET_ITEM(pyDetLangs, 21, PyString_FromString("ESTONIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 22, PyString_FromString("FINNISH"));
  PyTuple_SET_ITEM(pyDetLangs, 23, PyString_FromString("FRENCH"));
  PyTuple_SET_ITEM(pyDetLangs, 23, PyString_FromString("GALICIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 24, PyString_FromString("GEORGIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 25, PyString_FromString("GERMAN"));
  PyTuple_SET_ITEM(pyDetLangs, 26, PyString_FromString("GREEK"));
  PyTuple_SET_ITEM(pyDetLangs, 27, PyString_FromString("GUJARATI"));
  PyTuple_SET_ITEM(pyDetLangs, 28, PyString_FromString("HAITIAN_CREOLE"));
  PyTuple_SET_ITEM(pyDetLangs, 29, PyString_FromString("HEBREW"));
  PyTuple_SET_ITEM(pyDetLangs, 30, PyString_FromString("HINDI"));
  PyTuple_SET_ITEM(pyDetLangs, 31, PyString_FromString("HUNGARIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 32, PyString_FromString("ICELANDIC"));
  PyTuple_SET_ITEM(pyDetLangs, 33, PyString_FromString("INDONESIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 34, PyString_FromString("INUKTITUT"));
  PyTuple_SET_ITEM(pyDetLangs, 35, PyString_FromString("IRISH"));
  PyTuple_SET_ITEM(pyDetLangs, 36, PyString_FromString("ITALIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 37, PyString_FromString("Japanese"));
  PyTuple_SET_ITEM(pyDetLangs, 38, PyString_FromString("KANNADA"));
  PyTuple_SET_ITEM(pyDetLangs, 39, PyString_FromString("KHMER"));
  PyTuple_SET_ITEM(pyDetLangs, 40, PyString_FromString("Korean"));
  PyTuple_SET_ITEM(pyDetLangs, 41, PyString_FromString("LAOTHIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 42, PyString_FromString("LATVIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 43, PyString_FromString("LITHUANIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 44, PyString_FromString("MACEDONIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 45, PyString_FromString("MALAY"));
  PyTuple_SET_ITEM(pyDetLangs, 46, PyString_FromString("MALAYALAM"));
  PyTuple_SET_ITEM(pyDetLangs, 47, PyString_FromString("MALTESE"));
  PyTuple_SET_ITEM(pyDetLangs, 48, PyString_FromString("NORWEGIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 49, PyString_FromString("ORIYA"));
  PyTuple_SET_ITEM(pyDetLangs, 50, PyString_FromString("PERSIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 51, PyString_FromString("POLISH"));
  PyTuple_SET_ITEM(pyDetLangs, 52, PyString_FromString("PORTUGUESE"));
  PyTuple_SET_ITEM(pyDetLangs, 53, PyString_FromString("PUNJABI"));
  PyTuple_SET_ITEM(pyDetLangs, 54, PyString_FromString("ROMANIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 55, PyString_FromString("RUSSIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 56, PyString_FromString("SERBIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 57, PyString_FromString("SINHALESE"));
  PyTuple_SET_ITEM(pyDetLangs, 58, PyString_FromString("SLOVAK"));
  PyTuple_SET_ITEM(pyDetLangs, 59, PyString_FromString("SLOVENIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 60, PyString_FromString("SPANISH"));
  PyTuple_SET_ITEM(pyDetLangs, 61, PyString_FromString("SWAHILI"));
  PyTuple_SET_ITEM(pyDetLangs, 62, PyString_FromString("SWEDISH"));
  PyTuple_SET_ITEM(pyDetLangs, 63, PyString_FromString("SYRIAC"));
  PyTuple_SET_ITEM(pyDetLangs, 64, PyString_FromString("TAGALOG"));
  PyTuple_SET_ITEM(pyDetLangs, 65, PyString_FromString("TAMIL"));
  PyTuple_SET_ITEM(pyDetLangs, 66, PyString_FromString("TELUGU"));
  PyTuple_SET_ITEM(pyDetLangs, 67, PyString_FromString("THAI"));
  PyTuple_SET_ITEM(pyDetLangs, 68, PyString_FromString("TIBETAN"));
  PyTuple_SET_ITEM(pyDetLangs, 69, PyString_FromString("TURKISH"));
  PyTuple_SET_ITEM(pyDetLangs, 70, PyString_FromString("UKRAINIAN"));
  PyTuple_SET_ITEM(pyDetLangs, 71, PyString_FromString("URDU"));
  PyTuple_SET_ITEM(pyDetLangs, 72, PyString_FromString("VIETNAMESE"));
  PyTuple_SET_ITEM(pyDetLangs, 73, PyString_FromString("WELSH"));
  PyTuple_SET_ITEM(pyDetLangs, 74, PyString_FromString("YIDDISH"));

  // Steals ref:
  PyModule_AddObject(m, "DETECTED_LANGUAGES", pyDetLangs);
  
  CLDError = PyErr_NewException((char *) "cld.error", NULL, NULL);
  // Steals ref:
  PyModule_AddObject(m, "error", CLDError);
}