The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* NATools - Package with parallel corpora tools
 * Copyright (C) 2002-2012  Alberto Simões
 *
 * This package is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */


#include <EXTERN.h>
#include <perl.h>
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>
#include <string.h>

#include <NATools/words.h>
#include "dictionary.h"
#include "unicode.h"
#include "natdict.h"

/**
 * @file
 * @brief Based on lexicon files and temporary dictionaries, creates a
 * NATools dictionary
 */

static nat_uint32_t put_string(wchar_t *str, wchar_t *orig,
                               nat_uint32_t ptr, nat_uint32_t size) {
    nat_uint32_t i = 0;
    while(orig[i]) {
	str[ptr++] = orig[i++];
	if (ptr == size) {
	    str = g_realloc(str, size + 32000);
	    g_message("** REALLOC **");
	}
    }
    str[ptr++] = '\0';
    return ptr;
}

static nat_uint32_t tree_to_array(nat_uint32_t nrwords, wchar_t *str, 
                                  NATCell *cells, WordLstNode *tree, nat_uint32_t ptr,
                                  nat_uint32_t size, nat_uint32_t *cellptr, nat_uint32_t *tab) {
    nat_uint32_t offset = ptr;
    if (tree->left)
	offset = tree_to_array(nrwords, str, cells, tree->left, offset, size, cellptr, tab);

    cells[*cellptr].offset = offset;
    cells[*cellptr].count = tree->count;
    cells[*cellptr].id = *cellptr;

#ifdef DEBUG
    if (tree->id >= nrwords || *cellptr >= nrwords) 
	g_message("*** Words/id/cellptr (%u,%u,%u) ***", nrwords, tree->id, *cellptr);
#endif
    tab[tree->id] = *cellptr;

    ++*cellptr;

    offset = put_string(str, tree->string, offset, size);

    if (tree->right)
	offset = tree_to_array(nrwords, str, cells, tree->right, offset, size, cellptr, tab);
    return offset;
}

static int save(char *outfile,
		char *lang1, char *lang2,
		char *dicfile1, nat_uint32_t *tab1,
		char *dicfile2, nat_uint32_t *tab2,
		wchar_t *string1, nat_uint32_t ptr1, NATCell *cells1, nat_uint32_t size1,
		wchar_t *string2, nat_uint32_t ptr2, NATCell *cells2, nat_uint32_t size2) {
    FILE *out = NULL;
    nat_int_t s;

    Dictionary *dic;

    out = gzopen(outfile, "wb");
    if (!out) return 0;

    // Say this is a NATDict
    gzprintf(out,"!NATDict");

    s = strlen(lang1)+1;
    gzwrite(out, &s, sizeof(nat_int_t));
    gzwrite(out, lang1, s);

    s = strlen(lang2)+1;
    gzwrite(out, &s, sizeof(nat_int_t));
    gzwrite(out, lang2, s);

    // Save first lexicon
    g_message("** Saving source Lexicon **");
    gzwrite(out, &ptr1, sizeof(nat_uint32_t));
    gzwrite(out, string1, ptr1);
    gzwrite(out, &size1, sizeof(nat_uint32_t));
    g_message("\tSize: %u", size1);
    gzwrite(out, cells1, sizeof(NATCell) * size1);

    // Save second lexicon
    g_message("** Saving target Lexicon **");
    gzwrite(out, &ptr2, sizeof(nat_uint32_t));
    gzwrite(out, string2, ptr2);
    gzwrite(out, &size2, sizeof(nat_uint32_t));
    g_message("\tSize: %u", size2);
    gzwrite(out, cells2, sizeof(NATCell)* size2);

    // Load first dictionary
    g_message("** Source -> Target dictionary **");
    g_message("\tLoading...");
    dic = dictionary_open(dicfile1);

    dictionary_remap(tab1, tab2, dic);

    g_message("\tSaving...");
    gzwrite(out, &dic->size, sizeof(nat_uint32_t));
    gzwrite(out, dic->pairs, sizeof(DicPair)*MAXENTRY*(dic->size+1));
    gzwrite(out, dic->occurs, sizeof(nat_uint32_t)*(dic->size+1));
    dictionary_free(dic);

    // Load second dictionary
    g_message("** Target -> Source dictionary **");
    g_message("\tLoading...");
    dic = dictionary_open(dicfile2);

    dictionary_remap(tab2, tab1, dic);

    g_message("\tSaving...");
    gzwrite(out, &dic->size, sizeof(nat_uint32_t));
    gzwrite(out, dic->pairs, sizeof(DicPair)*MAXENTRY*(dic->size+1));
    gzwrite(out, dic->occurs, sizeof(nat_uint32_t)*(dic->size+1));
    dictionary_free(dic);

    // Close the file
    g_message("** DONE **");
    gzclose(out);

    return 1;
}

static void go(char *lang1, char *lang2,
               char *lexfile1, char *dicfile1,
               char *lexfile2, char *dicfile2,
               char *outfile) {
    Words *lex1, *lex2;
    wchar_t *string1 = NULL;
    wchar_t *string2 = NULL;
    nat_uint32_t ptr1 = 0;
    nat_uint32_t ptr2 = 0;
    nat_uint32_t size1, size2;
    NATCell *cells1 = NULL;
    NATCell *cells2 = NULL;
    nat_uint32_t cellptr1 = 0;
    nat_uint32_t cellptr2 = 0;
    nat_uint32_t *tab1 = NULL;
    nat_uint32_t *tab2 = NULL;

    /* ---- First ------------------------------- */
    lex1 = words_quick_load(lexfile1);
    if (!lex1) { fprintf(stderr, "Error loading lexicon 1\n"); exit(1); }

    size1 = 11 * lex1->count;

    string1 = g_new0(wchar_t, size1);
    if (!string1) { fprintf(stderr, "Error allocating string1\n"); exit(1); }

    cells1 = g_new0(NATCell, lex1->count + 1);
    if (!cells1) { fprintf(stderr, "Error allocating cells1\n"); exit(1); }

    tab1 = g_new0(nat_uint32_t, lex1->count + 1);
    if (!tab1) { fprintf(stderr, "Error allocating tab1\n"); exit(1); }

    tab1[0] = tab1[1] = lex1->count-1;
    ptr1 = tree_to_array(lex1->count,string1, cells1, lex1->tree, ptr1, size1, &cellptr1, tab1);
    
    cells1[cellptr1].offset = ptr1;
    cells1[cellptr1].count = 0;
    cells1[cellptr1].id = cellptr1;
    cellptr1++;

    g_message("** Preparing source Lexicon **");
    g_message("\tPtr is at %u and original size was %u", ptr1, size1);
    g_message("\tOffset on the array is %u", cellptr1);
    g_message("\tNULL is pointing to %u", tab1[0]);

    /* ---- Second ------------------------------ */
    lex2 = words_quick_load(lexfile2);
    if (!lex2) report_error("Error loading lexicon 2\n");

    size2 = 11*lex2->count;

    string2 = g_new0(wchar_t, size2);
    if (!string2) report_error("Error allocating string2\n");

    cells2 = g_new0(NATCell, lex2->count+1);
    if (!cells2) report_error("Error allocating cells2\n");

    tab2 = g_new0(nat_uint32_t, lex2->count+1);
    if (!tab2) report_error("Error allocating tab2\n");

    tab2[0] = tab2[1] = lex2->count-1;
    ptr2 = tree_to_array(lex2->count,string2, cells2, lex2->tree, ptr2, size2, &cellptr2, tab2);

    cells2[cellptr2].offset = ptr2;
    cells2[cellptr2].count = 0;
    cells2[cellptr2].id = cellptr2;
    cellptr2++;

    g_message("** Preparing target Lexicon **");
    g_message("\tPtr is at %u and original size was %u", ptr2, size2);
    g_message("\tOffset on the array is %u", cellptr2);
    g_message("\tNULL is pointing to %u", tab2[0]);

    save(outfile,
	 lang1, lang2, 
	 dicfile1, tab1, dicfile2, tab2,
	 string1, ptr1, cells1, cellptr1,
	 string2, ptr2, cells2, cellptr2);   

/*    save(outfile, lang1, lang2, 
	 dicfile1, tab1, dicfile2, tab2,
	 string1, ptr1, cells1, lex1->count,
	 string2, ptr2, cells2, lex2->count);   */


    words_free(lex1);
    words_free(lex2);
}

/**
 * @brief Main Program
 *
 * @todo Document all this program
 */
int main(int argc, char *argv[]) {

    init_locale();
    
    if (argc != 8) {
	printf("mkdict <lang1> <lang2> <lex1> <dic1> <lex2> <dic2> <outfile>\n");
	return 1;
    } else {
	go(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]);
	return 0;
    }
}