/* The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event. */
/* -*- Mode: C; c-file-style: "stroustrup" -*- */

/* NATools - Package with parallel corpora tools
 * Copyright (C) 1998-2001  Djoerd Hiemstra
 * Copyright (C) 2002-2012  Alberto Simões
 *
 * This package is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <stdio.h>
#include <string.h>

#include "standard.h"
#include <NATools/corpus.h>
#include "matrix.h"


/**
 * @file
 * @brief Allocates and fills the sparse matrix of word co-occurrences
 */


/* #define SAVE_DOTS 1 */

/**
 * @brief Loads a list of excluded word ids and flags them in a lookup table.
 *
 * The file is read as a plain sequence of binary nat_uint32_t values.  For
 * every id read, buffer[id] is set to 1.
 *
 * @param nr     exclusive upper bound for valid ids; any id >= nr makes the
 *               load fail
 * @param buffer table indexed by word id; the caller must provide at least
 *               nr entries
 * @param file   path of the exclude-words file
 *
 * @return TRUE when the whole file was consumed, FALSE when the file cannot
 *         be opened, a read error occurs, or an id is out of range.
 */
static nat_boolean_t load_exc_words(nat_uint32_t nr, char *buffer, char* file)
{
    FILE *fd;
    nat_uint32_t id;
    nat_boolean_t ok = TRUE;

    fd = fopen(file, "rb");
    if (!fd) return FALSE;

    /* Loop on the fread() result instead of feof(): on a read error feof()
     * never becomes true and the loop would spin forever. */
    while (fread(&id, sizeof(nat_uint32_t), 1, fd) == 1) {
        if (id >= nr) {
            ok = FALSE;
            break;
        }
        buffer[id] = 1;
    }

    /* A short read caused by an I/O error (not by end of file) is a failure */
    if (ferror(fd)) ok = FALSE;

    /* Release the stream on every path; it used to be leaked */
    fclose(fd);

    return ok;
}

/**
 * @brief Builds the initial co-occurrence matrix from two parallel corpora.
 *
 * Both corpora are walked sentence by sentence in lock-step.  For each
 * sentence pair, l is the larger of the two values returned by
 * corpus_sentence_length(); pairs with l > MAXLEN are skipped.  Otherwise
 * every non-excluded word of the first sentence is combined with every
 * non-excluded word of the second one, and the MATRIX_1 cell for that word
 * pair is incremented by 1/l.  When one sentence runs out before the other
 * (a cell whose word id is 0), the remaining words are combined with
 * NULLWORD instead.
 *
 * @param quiet    when true, no progress information is written to stderr
 * @param Nrow     first dimension passed to AllocMatrix
 * @param Ncolumn  second dimension passed to AllocMatrix
 * @param corpus1  first corpus; its word ids are the first index given to
 *                 IncValue (presumably the matrix row)
 * @param corpus2  second corpus; its word ids are the second index given to
 *                 IncValue (presumably the matrix column)
 * @param excWrds1 optional table (may be NULL) indexed by word id; a
 *                 non-zero entry excludes that word of corpus1
 * @param excWrds2 optional table (may be NULL) indexed by word id; a
 *                 non-zero entry excludes that word of corpus2
 *
 * @return the matrix obtained from AllocMatrix, filled with the estimate.
 *         Every failure is handed to report_error (presumably fatal --
 *         NOTE(review): confirm that report_error does not return).
 */
static Matrix* InitialEstimate(nat_boolean_t quiet,
                               nat_uint32_t Nrow, nat_uint32_t Ncolumn, 
			       Corpus *corpus1, Corpus *corpus2,
			       char *excWrds1,	char *excWrds2)
{ 

#ifdef SAVE_DOTS
    FILE *dots_fd;
#endif

    Matrix *matrix;
    unsigned long cSentence, nSentences;
    unsigned long r, c, l;
    int jjdoneR, jjdoneC;
    CorpusCell *s1, *s2, *sen2;

    if (!quiet)
        fprintf(stderr, "\nAllocating the sparse matrix (%d x %d):      ",
                Nrow, Ncolumn);

    /* Allocate the matrix that will receive the estimate */
    matrix = AllocMatrix(Nrow, Ncolumn);
    if (!matrix) report_error("InitialEstimate: AllocMatrix failed");

    /* Prepare the counters used by the percentage display */
    nSentences = corpus_sentences_nr(corpus1);
    cSentence = 0;

    /* s1/s2 are cursors into the current sentence pair; sen2 remembers the
     * start of the second sentence so s2 can be rewound for each s1 word */
    s1 = corpus_first_sentence(corpus1);
    s2 = sen2 = corpus_first_sentence(corpus2);

#ifdef SAVE_DOTS
    /* Debug aid: every pair that is counted is also logged to __dots__ */
    dots_fd = fopen("__dots__", "w");
    if (!dots_fd) report_error("cannot open __dots__ file");
#endif

    /* Process sentence pairs until either corpus has no more sentences */
    while (s1 != NULL && s2 != NULL) {
	/* Overwrite the previous percentage in place (the counter only
	 * advances when progress output is enabled) */
        if (!quiet)
            fprintf(stderr, "\b\b\b\b\b%4.1f%%",
                    (float) (cSentence++) * 99.9f / (float) nSentences);

	/* l bounds both loops and is the divisor of every increment */
	l = max(corpus_sentence_length(s1),
		corpus_sentence_length(s2));
	/* Sentence pairs longer than MAXLEN do not contribute at all */
	if (l <= MAXLEN) {
            jjdoneR = 0;
	    /* Outer loop: one pass per cell of the first sentence */
	    for(r = 1; r <= l && !jjdoneR ; r++) {
		/* Skip this pass when the current s1 word is excluded */
		if (!(excWrds1 && s1->word && excWrds1[s1->word])) {
                    jjdoneC = 0;
		    /* Inner loop: one pass per cell of the second sentence */
		    for(c = 1; c <= l && !jjdoneC ; c++) {
			if (excWrds2 && s2->word && excWrds2[s2->word]) {
			    /* Excluded s2 word: step over it, count nothing */
			    ++s2;
			} else {
			    if (s1->word && s2->word) {
				/* Two real words: count their co-occurrence */
				if (IncValue(matrix, MATRIX_1, 1.0f / (float)l, s1->word, s2->word))
				    report_error("InitialEstimate: IncValue failed");
#ifdef SAVE_DOTS
				fprintf(dots_fd, "%d %d\n", s1->word, s2->word);
#endif
				++s2;
			    }
			    else {
				if (s1->word == 0) {
				    /* First sentence is exhausted: pair the
				     * s2 words with NULLWORD, then stop the
				     * outer loop once this pass is over */
				    if (IncValue(matrix, MATRIX_1, 1.0f / (float)l, NULLWORD, s2->word))
					report_error("InitialEstimate: IncValue failed");
#ifdef SAVE_DOTS
				fprintf(dots_fd, "0 %d\n", s2->word);
#endif
				    ++s2;
                                    jjdoneR=1;
				}
				else {
				    /* Second sentence is exhausted: pair the
				     * s1 word with NULLWORD once and leave
				     * the inner loop */
				    if (IncValue(matrix, MATRIX_1, 1.0f / (float)l, s1->word, NULLWORD))
					report_error("InitialEstimate: IncValue failed");
#ifdef SAVE_DOTS
				fprintf(dots_fd, "%d 0\n", s1->word);
#endif
                                    jjdoneC=1;
                                }
			    }
			}
		    }
		}
		/* Advance within sentence 1 unless already on its 0 cell,
		 * and rewind to the start of sentence 2 for the next pass */
		if (s1->word) s1++;
		s2 = sen2;
	    }
	}
	/* Move both corpora on to their next sentence */
	s1 = corpus_next_sentence(corpus1);
	s2 = sen2 = corpus_next_sentence(corpus2);
    }
    
#ifdef SAVE_DOTS
    fclose(dots_fd);
#endif

    /* Both cursors must have run out together; otherwise one corpus still
     * had sentences left when the other ended */
    if (s1 != NULL || s2 != NULL)
	report_error("InitialEstimate: failed to evaluate all sentences");

    if (!quiet) fprintf(stderr, "\b\b\b\b\b\b done \n");

    return matrix;
}

/**
 * @brief Writes the nat-initmat usage summary and option list to stdout.
 */
void show_help () {
    /* The whole help text is emitted with a single call */
    printf("Usage:\n"
           "  nat-initmat [-q] corpusFile1 corpusFile2 matFile\n"
           "  nat-initmat [-q] corpusFile1 corpusFile2 excludeWrds1 excludeWrds2 matFile\n"
           "Supported options:\n"
           "  -h shows this help message and exits\n"
           "  -V shows "PACKAGE" version and exits\n"
           "  -q activates quiet mode\n"
           "Check nat-initmat manpage for details.\n");
}



/**
 * @brief Entry point of nat-initmat.
 *
 * Parses the options (-h, -V, -q), loads the two corpora, optionally loads
 * the two exclude-word lists, builds the initial co-occurrence matrix with
 * InitialEstimate() and saves it to the matrix file.
 *
 * Accepted positional arguments:
 *  - corpusFile1 corpusFile2 matFile
 *  - corpusFile1 corpusFile2 excludeWrds1 excludeWrds2 matFile
 *
 * @param argc number of command-line arguments
 * @param argv command-line arguments
 *
 * @return 0 on success and after -h or -V; 1 on an unknown option or a wrong
 *         number of arguments.  Other failures are passed to report_error.
 */
int main(int argc, char **argv)
{
    char *excWrds1 = NULL;
    char *excWrds2 = NULL;
    char *matFile;
    Corpus *corpus1, *corpus2;
    Matrix *matrix;
    nat_uint32_t total1, total2;
    nat_boolean_t quiet = FALSE;

    extern int optind;
    int c;

    /* getopt() reports the end of the options with -1 */
    while ((c = getopt(argc, argv, "hqV")) != -1) {
        switch (c) {
        case 'h':
            show_help();
            return 0;
        case 'V':
            printf(PACKAGE " version " VERSION "\n");
            return 0;
        case 'q':
            quiet = TRUE;
            break;
        default:
            show_help();
            return 1;
        }
    }

    /* Exactly three or five positional arguments are accepted */
    if (argc != optind + 3 && argc != optind + 5) {
        printf("nat-initmat: wrong number of arguments\n");
        show_help();
        return 1;
    }

    corpus1 = corpus_new();
    corpus2 = corpus_new();

    corpus_load(corpus1, argv[optind + 0]);
    corpus_load(corpus2, argv[optind + 1]);

    if (corpus_sentences_nr(corpus1) != corpus_sentences_nr(corpus2))
        report_error("initmat.c: lengths do not match");

    /* Values reported by corpus_diff_words_nr: used as the matrix
     * dimensions and as the upper bound for ids in the exclude lists */
    total1 = corpus_diff_words_nr(corpus1);
    total2 = corpus_diff_words_nr(corpus2);

    /* Count positional arguments relative to optind.  Comparing argc with a
     * fixed 6 broke as soon as an option such as -q was given: the exclude
     * lists were ignored and the matrix was written over excludeWrds1. */
    if (argc == optind + 5) {
        excWrds1 = g_new0(char, total1 + 1);
        if (!load_exc_words(total1, excWrds1, argv[optind + 2]))
            report_error("initmat.c: error loading excludeWrds1");

        excWrds2 = g_new0(char, total2 + 1);
        if (!load_exc_words(total2, excWrds2, argv[optind + 3]))
            report_error("initmat.c: error loading excludeWrds2");

        matFile = argv[optind + 4];
    } else {
        matFile = argv[optind + 2];
    }

    matrix = InitialEstimate(quiet, total1, total2, corpus1, corpus2, excWrds1, excWrds2);

    /* The exclude tables are only needed while estimating; both pointers
     * are still NULL when no lists were given, which g_free() accepts */
    g_free(excWrds1);
    g_free(excWrds2);

    if (SaveMatrix(matrix, matFile)) report_error("SaveMatrix");

    if (!quiet) {
        fprintf(stderr, "Memory used:%10.1f kb\n\n", (float) BytesInUse(matrix) / 1024.0f);
    }

    corpus_free(corpus1);
    corpus_free(corpus2);
    FreeMatrix(matrix);

    return 0;
}