/* -*- Mode: C; c-file-style: "stroustrup" -*- */
/* NATools - Package with parallel corpora tools
* Copyright (C) 2002-2012 Alberto Simões
*
* This package is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "ngramidx.h"
#include <sys/types.h>
#include <sys/stat.h>
/* Size limit for each in-memory n-gram cache: the *_add_occurrence
 * functions flush a cache to the database once it holds more than this
 * many entries. */
#define CACHE_SIZE 100000
/**
 * @brief Tell whether a path names an existing regular file
 *
 * @param filename path to test
 * @return 1 when stat() succeeds and the path is a regular file,
 *         0 in every other case (stat() failure, directory, device, ...)
 */
static int file_exists(const char* filename) {
    struct stat info;

    if (stat(filename, &info) != 0)
        return 0;

    return S_ISREG(info.st_mode) ? 1 : 0;
}
/* Apply the tuning PRAGMAs used for every n-gram database connection.
 * The statements are executed in the order listed; their results are
 * ignored. */
static void ngramidx_sqlite3_pragmas(sqlite3* dbh) {
    static const char *const pragmas[] = {
        "PRAGMA page_size = 4096;",
        "PRAGMA temp_store = MEMORY;",
        "PRAGMA cache_size = 1000000;",
        "PRAGMA synchronous = OFF;",
        "PRAGMA count_changes = 0;",
    };
    size_t i;

    for (i = 0; i < sizeof pragmas / sizeof pragmas[0]; ++i)
        sqlite3_exec(dbh, pragmas[i], NULL, NULL, NULL);
}
/* Run one CREATE TABLE statement on a freshly created database.  On
 * failure the SQLite error is printed on stderr, the handle is released
 * and the process exits with status 1. */
static void ngramidx_create_table_or_die(SQLite *res, const char *sql) {
    char *errmsg = NULL;

    if (sqlite3_exec(res->dbh, sql, NULL, NULL, &errmsg) != SQLITE_OK) {
        fprintf(stderr, "Error creating table: %s\n", errmsg);
        sqlite3_free(errmsg);
        sqlite3_close(res->dbh);
        free(res);
        exit(1);
    }
}

/**
 * @brief Open, creating it if needed, a SQLite database for ngrams
 *
 * If @p filename already names a regular file it is opened as is.
 * Otherwise the database is created together with the table(s) for the
 * requested n-gram order.  In both cases the connection PRAGMAs are
 * applied, a transaction is started with BEGIN (ngram_index_close()
 * issues the matching END) and one in-memory cache is created for each
 * requested order.  Cache pointers for orders that were not requested
 * are set to NULL.
 *
 * Failing to open the database or to create a table is fatal: a message
 * is printed on stderr and the process exits with status 1.
 *
 * @param filename name for the SQLite file
 * @param n order of the ngrams stored in the database (2, 3, 4 or -1 for all)
 * @return the new SQLite object, or NULL when @p n is not one of the
 *         accepted values or the handle cannot be allocated
 */
SQLite* ngram_index_new(const char* filename, int n) {
    SQLite *res;
    int existed;

    if (n != -1 && n != 2 && n != 3 && n != 4) return NULL;

    res = malloc(sizeof *res);
    if (res == NULL) {
        fprintf(stderr, "Can't allocate memory for database handle\n");
        return NULL;
    }
    res->n = n;
    res->bigram_cache = NULL;
    res->trigram_cache = NULL;
    res->tetragram_cache = NULL;

    /* Must be sampled before sqlite3_open(): it decides whether the
     * tables still have to be created. */
    existed = file_exists(filename);

    if (sqlite3_open(filename, &(res->dbh))) {
        fprintf(stderr, "Can't open database: %s\n", sqlite3_errmsg(res->dbh));
        sqlite3_close(res->dbh);
        free(res);
        exit(1);
    }
    ngramidx_sqlite3_pragmas(res->dbh);

    if (!existed) {
        if (n == 2 || n == -1)
            ngramidx_create_table_or_die(
                res,
                "CREATE TABLE bigrams (word1 INTEGER, word2 INTEGER, "
                "occs INTEGER, PRIMARY KEY (word1, word2))");
        if (n == 3 || n == -1)
            ngramidx_create_table_or_die(
                res,
                "CREATE TABLE trigrams (word1 INTEGER, word2 INTEGER, "
                "word3 INTEGER, "
                "occs INTEGER, PRIMARY KEY (word1, word2, word3))");
        if (n == 4 || n == -1)
            ngramidx_create_table_or_die(
                res,
                "CREATE TABLE tetragrams (word1 INTEGER, word2 INTEGER, "
                "word3 INTEGER, word4 INTEGER, "
                "occs INTEGER, PRIMARY KEY (word1, word2, word3, word4))");
    }

    sqlite3_exec(res->dbh, "BEGIN", NULL, NULL, NULL);

    /* One cache per requested order.  The tables are created without
     * destroy functions, so keys and values are owned by the code that
     * inserts and flushes them. */
    if (n == 2 || n == -1)
        res->bigram_cache = g_hash_table_new(g_str_hash, g_str_equal);
    if (n == 3 || n == -1)
        res->trigram_cache = g_hash_table_new(g_str_hash, g_str_equal);
    if (n == 4 || n == -1)
        res->tetragram_cache = g_hash_table_new(g_str_hash, g_str_equal);

    return res;
}
SQLite* ngram_index_open_and_attach(const char* template) {
SQLite* db = NULL;
char *temp_file = NULL;
char *temp_command = NULL;
int n;
for (n=2; n<=4; ++n) {
temp_file = g_strdup_printf(template, n);
if (!db) {
db = ngram_index_open(temp_file, n);
g_free(temp_file);
if (!db) return NULL;
} else {
if (n==3) {
temp_command = g_strdup_printf("ATTACH \"%s\" as trigrams;", temp_file);
} else if (n==4) {
temp_command = g_strdup_printf("ATTACH \"%s\" as tetragrams;", temp_file);
}
sqlite3_exec(db->dbh, temp_command, NULL, NULL, NULL);
g_free(temp_command);
g_free(temp_file);
}
}
db -> n = -1;
return db;
}
SQLite* ngram_index_open(const char* filename, int n) {
SQLite *res;
int rc;
if (n!=-1 && n!=2 && n!=3 && n!=4) return NULL;
res = (SQLite*) malloc(sizeof(SQLite));
res -> n = n;
rc = sqlite3_open(filename, &(res->dbh));
if( rc ){
fprintf(stderr, "Can't open database: %s\n", sqlite3_errmsg(res->dbh));
sqlite3_close(res->dbh);
free(res);
return NULL;
}
ngramidx_sqlite3_pragmas(res->dbh);
return res;
}
/**
 * @brief Flush the caches, commit and close an n-gram database
 *
 * Every cache selected by the handle's n is written to the database
 * through the matching *_free_cache callback and then destroyed.  The
 * pending transaction is committed with END, the connection is closed
 * and the handle is freed; it must not be used afterwards.
 *
 * @param sqstruct handle to close; NULL is accepted and ignored
 */
void ngram_index_close(SQLite *sqstruct) {
    sqlite3 *db;
    char *errmsg = NULL;
    int n;

    if (sqstruct == NULL) return;

    db = sqstruct->dbh;
    n = sqstruct->n;

    /* Result deliberately ignored: ngram_index_new() already leaves a
     * transaction open, in which case this BEGIN is rejected by SQLite
     * and the existing transaction is the one committed below. */
    sqlite3_exec(db, "BEGIN", NULL, NULL, NULL);

    /* Dump our final cache.  A cache may be NULL when the handle was set
     * up without one. */
    if ((n == -1 || n == 2) && sqstruct->bigram_cache != NULL) {
        g_hash_table_foreach_steal(sqstruct->bigram_cache,
                                   bigram_free_cache,
                                   (gpointer) db);
        g_hash_table_destroy(sqstruct->bigram_cache);
    }
    if ((n == -1 || n == 3) && sqstruct->trigram_cache != NULL) {
        g_hash_table_foreach_steal(sqstruct->trigram_cache,
                                   trigram_free_cache,
                                   (gpointer) db);
        g_hash_table_destroy(sqstruct->trigram_cache);
    }
    if ((n == -1 || n == 4) && sqstruct->tetragram_cache != NULL) {
        g_hash_table_foreach_steal(sqstruct->tetragram_cache,
                                   tetragram_free_cache,
                                   (gpointer) db);
        g_hash_table_destroy(sqstruct->tetragram_cache);
    }

    /* COMMIT COMMIT!!  A failure here means the flushed counts were not
     * stored, so report it instead of dropping it silently. */
    if (sqlite3_exec(db, "END", NULL, NULL, &errmsg) != SQLITE_OK) {
        fprintf(stderr, "Error committing ngram database: %s\n",
                errmsg ? errmsg : "unknown error");
        sqlite3_free(errmsg);
    }

    sqlite3_close(db);
    free(sqstruct);
}
static int set_exists(void *exists, int argc, char **argv, char **azColName) {
*((nat_uint32_t*)exists) = (nat_uint32_t)g_ascii_strtoull(argv[0], NULL, 10);
return 0;
}
/**
 * @brief Count one more occurrence of a bigram
 *
 * The occurrence is accumulated in the in-memory bigram cache under the
 * key "w1|w2".  Once the cache holds more than CACHE_SIZE entries it is
 * flushed to the database through bigram_free_cache().  Nothing is done
 * when the handle's n is neither 2 nor -1.
 *
 * @param sqstruct n-gram database handle
 * @param w1 identifier of the first word
 * @param w2 identifier of the second word
 */
void bigram_add_occurrence(SQLite* sqstruct, nat_uint32_t w1, nat_uint32_t w2) {
    sqlite3 *db = sqstruct->dbh;
    nat_uint32_t *counter;
    char *token;

    if (sqstruct->n != -1 && sqstruct->n != 2) return;

    /* Use our beloved cache */
    token = g_strdup_printf("%u|%u", w1, w2);
    counter = (nat_uint32_t *) g_hash_table_lookup(sqstruct->bigram_cache, token);
    if (counter) {
        /* Already cached: bump the stored counter in place.  The table
         * keeps the key it already owns and has no key-destroy function,
         * so the lookup key has to be released here or it leaks. */
        (*counter)++;
        g_free(token);
    } else {
        /* First occurrence: the table takes over both token and counter;
         * bigram_free_cache() frees them when the entry is flushed. */
        counter = (nat_uint32_t *) g_malloc(sizeof(nat_uint32_t));
        *counter = 1;
        g_hash_table_insert(sqstruct->bigram_cache, token, counter);
    }

    if (g_hash_table_size(sqstruct->bigram_cache) > CACHE_SIZE) {
        g_hash_table_foreach_steal(sqstruct->bigram_cache, bigram_free_cache, (gpointer) db);
    }
    /* END cache */
}
/**
 * @brief Count one more occurrence of a trigram
 *
 * The occurrence is accumulated in the in-memory trigram cache under the
 * key "w1|w2|w3".  Once the cache holds more than CACHE_SIZE entries it
 * is flushed to the database through trigram_free_cache().  Nothing is
 * done when the handle's n is neither 3 nor -1.
 *
 * @param sqstruct n-gram database handle
 * @param w1 identifier of the first word
 * @param w2 identifier of the second word
 * @param w3 identifier of the third word
 */
void trigram_add_occurrence(SQLite* sqstruct, nat_uint32_t w1, nat_uint32_t w2, nat_uint32_t w3) {
    sqlite3 *db = sqstruct->dbh;
    nat_uint32_t *counter;
    char *token;

    if (sqstruct->n != -1 && sqstruct->n != 3) return;

    /* Use our beloved cache */
    token = g_strdup_printf("%u|%u|%u", w1, w2, w3);
    counter = (nat_uint32_t *) g_hash_table_lookup(sqstruct->trigram_cache, token);
    if (counter) {
        /* Already cached: bump the stored counter in place.  The table
         * keeps the key it already owns and has no key-destroy function,
         * so the lookup key has to be released here or it leaks. */
        (*counter)++;
        g_free(token);
    } else {
        /* First occurrence: the table takes over both token and counter;
         * trigram_free_cache() frees them when the entry is flushed. */
        counter = (nat_uint32_t *) g_malloc(sizeof(nat_uint32_t));
        *counter = 1;
        g_hash_table_insert(sqstruct->trigram_cache, token, counter);
    }

    if (g_hash_table_size(sqstruct->trigram_cache) > CACHE_SIZE) {
        g_hash_table_foreach_steal(sqstruct->trigram_cache, trigram_free_cache, (gpointer) db);
    }
    /* END cache */
}
/**
 * @brief Count one more occurrence of a tetragram
 *
 * The occurrence is accumulated in the in-memory tetragram cache under
 * the key "w1|w2|w3|w4".  Once the cache holds more than CACHE_SIZE
 * entries it is flushed to the database through tetragram_free_cache().
 * Nothing is done when the handle's n is neither 4 nor -1.
 *
 * @param sqstruct n-gram database handle
 * @param w1 identifier of the first word
 * @param w2 identifier of the second word
 * @param w3 identifier of the third word
 * @param w4 identifier of the fourth word
 */
void tetragram_add_occurrence(SQLite* sqstruct, nat_uint32_t w1, nat_uint32_t w2, nat_uint32_t w3, nat_uint32_t w4) {
    sqlite3 *db = sqstruct->dbh;
    nat_uint32_t *counter;
    char *token;

    if (sqstruct->n != -1 && sqstruct->n != 4) return;

    /* Use our beloved cache */
    token = g_strdup_printf("%u|%u|%u|%u", w1, w2, w3, w4);
    counter = (nat_uint32_t *) g_hash_table_lookup(sqstruct->tetragram_cache, token);
    if (counter) {
        /* Already cached: bump the stored counter in place.  The table
         * keeps the key it already owns and has no key-destroy function,
         * so the lookup key has to be released here or it leaks. */
        (*counter)++;
        g_free(token);
    } else {
        /* First occurrence: the table takes over both token and counter;
         * tetragram_free_cache() frees them when the entry is flushed. */
        counter = (nat_uint32_t *) g_malloc(sizeof(nat_uint32_t));
        *counter = 1;
        g_hash_table_insert(sqstruct->tetragram_cache, token, counter);
    }

    if (g_hash_table_size(sqstruct->tetragram_cache) > CACHE_SIZE) {
        g_hash_table_foreach_steal(sqstruct->tetragram_cache, tetragram_free_cache, (gpointer) db);
    }
    /* END cache */
}
/**
 * @brief Flush one cached bigram count to the database
 *
 * Callback for g_hash_table_foreach_steal().  The key is parsed back
 * into its two word identifiers, the count currently stored in the
 * bigrams table (if any) is read, and the row is then updated with the
 * sum or inserted with the cached count.  Key and value are freed.
 *
 * A SQLite error is fatal: the message is printed on stderr, the
 * connection is closed and the process exits with status 1.  A key that
 * cannot be parsed is reported on stderr and its entry is discarded.
 *
 * @param key cache key of the form "w1|w2" (heap string, freed here)
 * @param value pointer to the cached nat_uint32_t count (freed here)
 * @param user_data the sqlite3 connection to write to
 * @return always TRUE, so the entry is removed from the hash table
 */
gboolean bigram_free_cache(gpointer key, gpointer value, gpointer user_data) {
    int rc;
    char *query = NULL;
    char *skey = (char *) key;
    nat_uint32_t *nvalue = (nat_uint32_t *) value;
    char *errmsg = NULL;
    nat_uint32_t exists = 0;
    nat_uint32_t w1 = 0, w2 = 0;
    sqlite3 *db = (sqlite3 *) user_data;

    /* Never build SQL from word ids that were not actually parsed. */
    if (sscanf(skey, "%u|%u", &w1, &w2) != 2) {
        fprintf(stderr, "Malformed bigram cache key: %s\n", skey);
        g_free(skey);
        g_free(nvalue);
        return TRUE;
    }

    /* set_exists() stores the current occs value; it stays 0 when the
     * bigram has no row yet. */
    query = g_strdup_printf("SELECT occs FROM bigrams WHERE word1=%u AND word2=%u",
                            w1, w2);
    rc = sqlite3_exec(db, query, set_exists, &exists, &errmsg);
    g_free(query);
    if (rc != SQLITE_OK) {
        fprintf(stderr, "Error searching for bigram: %s\n", errmsg);
        sqlite3_free(errmsg);
        sqlite3_close(db);
        exit(1);
    }

    if (exists) {
        query = g_strdup_printf("UPDATE bigrams SET occs = %u WHERE word1=%u AND word2=%u",
                                exists + (*nvalue), w1, w2);
    } else {
        query = g_strdup_printf("INSERT INTO bigrams VALUES(%u,%u,%u)",
                                w1, w2, *nvalue);
    }
    rc = sqlite3_exec(db, query, NULL, NULL, &errmsg);
    g_free(query);
    if (rc != SQLITE_OK) {
        fprintf(stderr, "Error inserting/updating bigram: %s\n", errmsg);
        sqlite3_free(errmsg);
        sqlite3_close(db);
        exit(1);
    }

    g_free(skey);
    g_free(nvalue);
    return TRUE;
}
/**
 * @brief Flush one cached trigram count to the database
 *
 * Callback for g_hash_table_foreach_steal().  The key is parsed back
 * into its three word identifiers, the count currently stored in the
 * trigrams table (if any) is read, and the row is then updated with the
 * sum or inserted with the cached count.  Key and value are freed.
 *
 * A SQLite error is fatal: the message is printed on stderr, the
 * connection is closed and the process exits with status 1.  A key that
 * cannot be parsed is reported on stderr and its entry is discarded.
 *
 * @param key cache key of the form "w1|w2|w3" (heap string, freed here)
 * @param value pointer to the cached nat_uint32_t count (freed here)
 * @param user_data the sqlite3 connection to write to
 * @return always TRUE, so the entry is removed from the hash table
 */
gboolean trigram_free_cache(gpointer key, gpointer value, gpointer user_data) {
    int rc;
    char *query = NULL;
    char *skey = (char *) key;
    nat_uint32_t *nvalue = (nat_uint32_t *) value;
    char *errmsg = NULL;
    nat_uint32_t exists = 0;
    nat_uint32_t w1 = 0, w2 = 0, w3 = 0;
    sqlite3 *db = (sqlite3 *) user_data;

    /* Never build SQL from word ids that were not actually parsed. */
    if (sscanf(skey, "%u|%u|%u", &w1, &w2, &w3) != 3) {
        fprintf(stderr, "Malformed trigram cache key: %s\n", skey);
        g_free(skey);
        g_free(nvalue);
        return TRUE;
    }

    /* set_exists() stores the current occs value; it stays 0 when the
     * trigram has no row yet. */
    query = g_strdup_printf("SELECT occs FROM trigrams WHERE word1=%u AND word2=%u AND word3=%u",
                            w1, w2, w3);
    rc = sqlite3_exec(db, query, set_exists, &exists, &errmsg);
    g_free(query);
    if (rc != SQLITE_OK) {
        fprintf(stderr, "Error searching for trigram: %s\n", errmsg);
        sqlite3_free(errmsg);
        sqlite3_close(db);
        exit(1);
    }

    if (exists) {
        query = g_strdup_printf("UPDATE trigrams SET occs = %u WHERE word1=%u AND word2=%u AND word3=%u",
                                exists + (*nvalue), w1, w2, w3);
    } else {
        query = g_strdup_printf("INSERT INTO trigrams VALUES(%u,%u,%u,%u)",
                                w1, w2, w3, *nvalue);
    }
    rc = sqlite3_exec(db, query, NULL, NULL, &errmsg);
    g_free(query);
    if (rc != SQLITE_OK) {
        fprintf(stderr, "Error inserting/updating trigram: %s\n", errmsg);
        sqlite3_free(errmsg);
        sqlite3_close(db);
        exit(1);
    }

    g_free(skey);
    g_free(nvalue);
    return TRUE;
}
/**
 * @brief Flush one cached tetragram count to the database
 *
 * Callback for g_hash_table_foreach_steal().  The key is parsed back
 * into its four word identifiers, the count currently stored in the
 * tetragrams table (if any) is read, and the row is then updated with
 * the sum or inserted with the cached count.  Key and value are freed.
 *
 * A SQLite error is fatal: the message is printed on stderr, the
 * connection is closed and the process exits with status 1.  A key that
 * cannot be parsed is reported on stderr and its entry is discarded.
 *
 * @param key cache key of the form "w1|w2|w3|w4" (heap string, freed here)
 * @param value pointer to the cached nat_uint32_t count (freed here)
 * @param user_data the sqlite3 connection to write to
 * @return always TRUE, so the entry is removed from the hash table
 */
gboolean tetragram_free_cache(gpointer key, gpointer value, gpointer user_data) {
    int rc;
    char *query = NULL;
    char *skey = (char *) key;
    nat_uint32_t *nvalue = (nat_uint32_t *) value;
    char *errmsg = NULL;
    nat_uint32_t exists = 0;
    nat_uint32_t w1 = 0, w2 = 0, w3 = 0, w4 = 0;
    sqlite3 *db = (sqlite3 *) user_data;

    /* Never build SQL from word ids that were not actually parsed. */
    if (sscanf(skey, "%u|%u|%u|%u", &w1, &w2, &w3, &w4) != 4) {
        fprintf(stderr, "Malformed tetragram cache key: %s\n", skey);
        g_free(skey);
        g_free(nvalue);
        return TRUE;
    }

    /* set_exists() stores the current occs value; it stays 0 when the
     * tetragram has no row yet. */
    query = g_strdup_printf("SELECT occs FROM tetragrams WHERE "
                            "word1=%u AND word2=%u AND word3=%u AND word4=%u",
                            w1, w2, w3, w4);
    rc = sqlite3_exec(db, query, set_exists, &exists, &errmsg);
    g_free(query);
    if (rc != SQLITE_OK) {
        fprintf(stderr, "Error searching for tetragram: %s\n", errmsg);
        sqlite3_free(errmsg);
        sqlite3_close(db);
        exit(1);
    }

    if (exists) {
        query = g_strdup_printf("UPDATE tetragrams SET occs = %u WHERE "
                                "word1=%u AND word2=%u AND word3=%u AND word4=%u",
                                exists + (*nvalue), w1, w2, w3, w4);
    } else {
        query = g_strdup_printf("INSERT INTO tetragrams VALUES(%u,%u,%u,%u,%u)",
                                w1, w2, w3, w4, *nvalue);
    }
    rc = sqlite3_exec(db, query, NULL, NULL, &errmsg);
    g_free(query);
    if (rc != SQLITE_OK) {
        fprintf(stderr, "Error inserting/updating tetragram: %s\n", errmsg);
        sqlite3_free(errmsg);
        sqlite3_close(db);
        exit(1);
    }

    g_free(skey);
    g_free(nvalue);
    return TRUE;
}