/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file ngram_model_arpa.c ARPA format language models
 *
 * Author: David Huggins-Daines
 */

#include "sphinxbase/ckd_alloc.h"

#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <assert.h>

#include "sphinxbase/err.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/listelem_alloc.h"
#include "sphinxbase/strfuncs.h"

#include "ngram_model_arpa.h"

static ngram_funcs_t ngram_model_arpa_funcs;

#define TSEG_BASE(m,b)          ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
#define FIRST_BG(m,u)           ((m)->lm3g.unigrams[u].bigrams)
#define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
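/*
 * The bigram list is divided into segments of BG_SEG_SZ entries
 * (1 << LOG_BG_SEG_SZ; 512 in the stock lm3g_model.h).  Each bigram
 * stores only a 16-bit offset of its first trigram relative to the
 * tseg_base of its segment, and FIRST_TG() reconstructs the absolute
 * trigram index.  A worked example, assuming the stock LOG_BG_SEG_SZ
 * of 9:
 *
 *     b = 1000          -> segment = 1000 >> 9 = 1
 *     FIRST_TG(m, 1000) = m->lm3g.tseg_base[1]
 *                       + m->lm3g.bigrams[1000].trigrams
 *
 * This is also why ReadTrigrams() below rejects any segment whose
 * relative trigram offset would exceed 65535.
 */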
/*
 * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
 */
static int
ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
{
    int32 ngram, ngram_cnt;

    /* skip file until past the '\data\' marker */
    while (*li) {
        string_trim((*li)->buf, STRING_BOTH);
        if (strcmp((*li)->buf, "\\data\\") == 0)
            break;
        *li = lineiter_next(*li);
    }

    if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
        /* INFO rather than ERROR: the caller may just be probing for
         * ARPA format before falling back to another reader. */
        E_INFO("No \\data\\ mark in LM file\n");
        return -1;
    }

    *n_ug = *n_bg = *n_tg = 0;
    while ((*li = lineiter_next(*li))) {
        if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
            break;
        switch (ngram) {
        case 1:
            *n_ug = ngram_cnt;
            break;
        case 2:
            *n_bg = ngram_cnt;
            break;
        case 3:
            *n_tg = ngram_cnt;
            break;
        default:
            E_ERROR("Unknown ngram (%d)\n", ngram);
            return -1;
        }
    }
    if (*li == NULL) {
        E_ERROR("EOF while reading ngram counts\n");
        return -1;
    }

    /* Position iterator to the unigrams header '\1-grams:' */
    while ((*li = lineiter_next(*li))) {
        string_trim((*li)->buf, STRING_BOTH);
        if (strcmp((*li)->buf, "\\1-grams:") == 0)
            break;
    }
    if (*li == NULL) {
        E_ERROR("Failed to find \\1-grams: mark\n");
        return -1;
    }

    if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
        E_ERROR("Bad or missing ngram count\n");
        return -1;
    }
    return 0;
}

/*
 * Read in the unigrams from given file into the LM structure model.
 * On entry to this procedure, the iterator is positioned to the
 * header line '\1-grams:'.
 */
static int
ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 wcnt;
    float p1;

    E_INFO("Reading unigrams\n");

    wcnt = 0;
    while ((*li = lineiter_next(*li))) {
        char *wptr[3], *name;
        float32 bo_wt = 0.0f;
        int n;

        string_trim((*li)->buf, STRING_BOTH);
        if (strcmp((*li)->buf, "\\2-grams:") == 0
            || strcmp((*li)->buf, "\\end\\") == 0)
            break;

        if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
            if ((*li)->buf[0] != '\0')
                E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
            continue;
        }
        else {
            p1 = (float)atof_c(wptr[0]);
            name = wptr[1];
            if (n == 3)
                bo_wt = (float)atof_c(wptr[2]);
        }

        if (wcnt >= base->n_counts[0]) {
            E_ERROR("Too many unigrams\n");
            return -1;
        }

        /* Associate name with word id */
        base->word_str[wcnt] = ckd_salloc(name);
        if ((hash_table_enter(base->wid, base->word_str[wcnt],
                              (void *)(long)wcnt)) != (void *)(long)wcnt) {
            E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]);
        }
        model->lm3g.unigrams[wcnt].prob1.l =
            logmath_log10_to_log(base->lmath, p1);
        model->lm3g.unigrams[wcnt].bo_wt1.l =
            logmath_log10_to_log(base->lmath, bo_wt);
        wcnt++;
    }

    if (base->n_counts[0] != wcnt) {
        E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
               base->n_counts[0], wcnt);
        base->n_counts[0] = wcnt;
        base->n_words = wcnt;
    }
    return 0;
}
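/*
 * For reference, the part of an ARPA file consumed by the two functions
 * above looks like the following (log10 probabilities; the counts,
 * values and words are illustrative only):
 *
 *     \data\
 *     ngram 1=3
 *     ngram 2=2
 *
 *     \1-grams:
 *     -1.0000 </s>
 *     -1.0000 <s>    -0.5000
 *     -0.6990 hello  -0.3010
 *
 * The optional third field on a unigram line is the backoff weight;
 * ReadUnigrams() treats a missing one as 0.0 (log10).
 */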
/*
 * Read bigrams from given file into given model structure.
 */
static int
ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 w1, w2, prev_w1, bgcount;
    bigram_t *bgptr;

    E_INFO("Reading bigrams\n");

    bgcount = 0;
    bgptr = model->lm3g.bigrams;
    prev_w1 = -1;

    while ((*li = lineiter_next(*li))) {
        float32 p, bo_wt = 0.0f;
        int32 p2, bo_wt2;
        char *wptr[4], *word1, *word2;
        int n;

        string_trim((*li)->buf, STRING_BOTH);
        wptr[3] = NULL;
        if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
            if ((*li)->buf[0] != '\0')
                break;
            continue;
        }
        else {
            p = (float32)atof_c(wptr[0]);
            word1 = wptr[1];
            word2 = wptr[2];
            if (wptr[3])
                bo_wt = (float32)atof_c(wptr[3]);
        }

        if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
                    word1, word1, word2);
            continue;
        }
        if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
                    word2, word1, word2);
            continue;
        }

        /* FIXME: Should use logmath_t quantization here. */
        /* HACK!! to quantize probs to 4 decimal digits */
        p = (float32)((int32)(p * 10000)) / 10000;
        bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
        p2 = logmath_log10_to_log(base->lmath, p);
        bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);

        if (bgcount >= base->n_counts[1]) {
            E_ERROR("Too many bigrams\n");
            return -1;
        }

        bgptr->wid = w2;
        bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
        if (base->n_counts[2] > 0)
            bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);

        if (w1 != prev_w1) {
            if (w1 < prev_w1) {
                E_ERROR("Bigram %s %s not in unigram order: word id %d, previous word id %d\n",
                        word1, word2, w1, prev_w1);
                return -1;
            }
            for (prev_w1++; prev_w1 <= w1; prev_w1++)
                model->lm3g.unigrams[prev_w1].bigrams = bgcount;
            prev_w1 = w1;
        }

        bgcount++;
        bgptr++;

        if ((bgcount & 0x0000ffff) == 0) {
            E_INFOCONT(".");
        }
    }
    if (*li == NULL
        || ((strcmp((*li)->buf, "\\end\\") != 0)
            && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
        /* Guard against dereferencing a NULL iterator at EOF. */
        E_ERROR("Bad bigram: %s\n", *li ? (*li)->buf : "EOF");
        return -1;
    }

    for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
        model->lm3g.unigrams[prev_w1].bigrams = bgcount;

    return 0;
}
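/*
 * Note on sorted_id() above: bigram probabilities and backoff weights
 * are not stored per entry.  Each distinct (quantized) log value is
 * kept once in a sorted list, and the bigram_t records only a compact
 * index into it; vals_in_sorted_list() later flattens the list into
 * the model's prob2/bo_wt2 arrays.  This is what the 4-digit
 * quantization hack buys: e.g. -1.23456 is truncated to -1.2345, so
 * nearly identical values collapse into one shared table slot.
 */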
/*
 * Very similar to ReadBigrams.
 */
static int
ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
    int32 seg, prev_seg, prev_seg_lastbg;
    trigram_t *tgptr;
    bigram_t *bgptr;

    E_INFO("Reading trigrams\n");

    tgcount = 0;
    tgptr = model->lm3g.trigrams;
    prev_w1 = -1;
    prev_w2 = -1;
    prev_bg = -1;
    prev_seg = -1;

    while ((*li = lineiter_next(*li))) {
        float32 p;
        int32 p3;
        char *wptr[4], *word1, *word2, *word3;

        string_trim((*li)->buf, STRING_BOTH);
        if (str2words((*li)->buf, wptr, 4) != 4) {
            if ((*li)->buf[0] != '\0')
                break;
            continue;
        }
        else {
            p = (float32)atof_c(wptr[0]);
            word1 = wptr[1];
            word2 = wptr[2];
            word3 = wptr[3];
        }

        if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
                    word1, word1, word2, word3);
            continue;
        }
        if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
                    word2, word1, word2, word3);
            continue;
        }
        if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
                    word3, word1, word2, word3);
            continue;
        }

        /* FIXME: Should use logmath_t quantization here. */
        /* HACK!! to quantize probs to 4 decimal digits */
        p = (float32)((int32)(p * 10000)) / 10000;
        p3 = logmath_log10_to_log(base->lmath, p);

        if (tgcount >= base->n_counts[2]) {
            E_ERROR("Too many trigrams\n");
            return -1;
        }

        tgptr->wid = w3;
        tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);

        if ((w1 != prev_w1) || (w2 != prev_w2)) {
            /* Trigram for a new bigram; update tg info for all previous bigrams */
            if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
                E_ERROR("Trigrams not in bigram order\n");
                return -1;
            }

            bg = (w1 != prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
            endbg = model->lm3g.unigrams[w1 + 1].bigrams;
            bgptr = model->lm3g.bigrams + bg;
            for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
            if (bg >= endbg) {
                E_ERROR("Missing bigram for trigram: %s\n", (*li)->buf);
                return -1;
            }

            /* bg = bigram entry index for <w1,w2>.  Update tseg_base */
            seg = bg >> LOG_BG_SEG_SZ;
            for (i = prev_seg + 1; i <= seg; i++)
                model->lm3g.tseg_base[i] = tgcount;

            /* Update trigram pointers for all bigrams until bg */
            if (prev_seg < seg) {
                int32 tgoff = 0;

                if (prev_seg >= 0) {
                    tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
                    if (tgoff > 65535) {
                        E_ERROR("Trigram segment offset exceeds 65535; such large language models are not supported, use a smaller vocabulary\n");
                        return -1;
                    }
                }

                prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
                bgptr = model->lm3g.bigrams + prev_bg;
                for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
                     prev_bg++, bgptr++)
                    bgptr->trigrams = tgoff;

                for (; prev_bg <= bg; prev_bg++, bgptr++)
                    bgptr->trigrams = 0;
            }
            else {
                int32 tgoff;

                tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
                if (tgoff > 65535) {
                    E_ERROR("Trigram segment offset exceeds 65535; such large language models are not supported, use a smaller vocabulary\n");
                    return -1;
                }

                bgptr = model->lm3g.bigrams + prev_bg;
                for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
                    bgptr->trigrams = tgoff;
            }

            prev_w1 = w1;
            prev_w2 = w2;
            prev_bg = bg;
            prev_seg = seg;
        }

        tgcount++;
        tgptr++;

        if ((tgcount & 0x0000ffff) == 0) {
            E_INFOCONT(".");
        }
    }
    if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
        /* Guard against dereferencing a NULL iterator at EOF. */
        E_ERROR("Bad trigram: %s\n", *li ? (*li)->buf : "EOF");
        return -1;
    }

    for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
        if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
            model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
        if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
            E_ERROR("Trigram segment offset exceeds 65535; such large language models are not supported, use a smaller vocabulary\n");
            return -1;
        }
        model->lm3g.bigrams[prev_bg].trigrams =
            tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
    }
    return 0;
}

static unigram_t *
new_unigram_table(int32 n_ug)
{
    unigram_t *table;
    int32 i;

    table = ckd_calloc(n_ug, sizeof(unigram_t));
    for (i = 0; i < n_ug; i++) {
        /* INT_MIN log value: effectively zero probability. */
        table[i].prob1.l = INT_MIN;
        table[i].bo_wt1.l = INT_MIN;
    }
    return table;
}
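/*
 * Given the sentinel entries allocated by ngram_model_arpa_read() below,
 * the bigram followers of unigram u occupy the half-open index range
 *
 *     [FIRST_BG(model, u), FIRST_BG(model, u + 1))
 *
 * and analogously [FIRST_TG(model, b), FIRST_TG(model, b + 1)) for the
 * trigram followers of bigram b.  ReadTrigrams() above relies on the
 * bigram range when it scans from unigrams[w1].bigrams up to
 * unigrams[w1 + 1].bigrams looking for w2.
 */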
ngram_model_t *
ngram_model_arpa_read(cmd_ln_t *config, const char *file_name,
                      logmath_t *lmath)
{
    lineiter_t *li;
    FILE *fp;
    int32 is_pipe;
    int32 n_unigram;
    int32 n_bigram;
    int32 n_trigram;
    int32 n;
    ngram_model_arpa_t *model;
    ngram_model_t *base;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return NULL;
    }
    li = lineiter_start(fp);

    /* Read #unigrams, #bigrams, #trigrams from file */
    if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);

    /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
    model = ckd_calloc(1, sizeof(*model));
    base = &model->base;
    if (n_trigram > 0)
        n = 3;
    else if (n_bigram > 0)
        n = 2;
    else
        n = 1;
    /* Initialize base model. */
    ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
    base->n_counts[0] = n_unigram;
    base->n_counts[1] = n_bigram;
    base->n_counts[2] = n_trigram;
    base->writable = TRUE;

    /*
     * Allocate one extra unigram and bigram entry: sentinels to terminate
     * followers (bigrams and trigrams, respectively) of previous entry.
     */
    model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
    model->lm3g.bigrams = ckd_calloc(n_bigram + 1, sizeof(bigram_t));
    if (n_trigram > 0)
        model->lm3g.trigrams = ckd_calloc(n_trigram, sizeof(trigram_t));

    if (n_trigram > 0) {
        model->lm3g.tseg_base =
            ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1, sizeof(int32));
    }
    if (ReadUnigrams(&li, model) == -1) {
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        ngram_model_free(base);
        return NULL;
    }
    E_INFO("%8d = #unigrams created\n", base->n_counts[0]);

    if (base->n_counts[2] > 0)
        init_sorted_list(&model->sorted_bo_wt2);
    if (base->n_counts[1] > 0) {
        init_sorted_list(&model->sorted_prob2);

        if (ReadBigrams(&li, model) == -1) {
            lineiter_free(li);
            fclose_comp(fp, is_pipe);
            ngram_model_free(base);
            return NULL;
        }

        base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
        model->lm3g.n_prob2 = model->sorted_prob2.free;
        model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
        free_sorted_list(&model->sorted_prob2);
        E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
        E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
    }

    if (base->n_counts[2] > 0) {
        /* Create trigram bo-wts array */
        model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
        model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
        free_sorted_list(&model->sorted_bo_wt2);
        E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);

        init_sorted_list(&model->sorted_prob3);

        if (ReadTrigrams(&li, model) == -1) {
            lineiter_free(li);
            fclose_comp(fp, is_pipe);
            ngram_model_free(base);
            return NULL;
        }

        base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
        model->lm3g.n_prob3 = model->sorted_prob3.free;
        model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
        E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
        E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
        free_sorted_list(&model->sorted_prob3);

        /* Initialize tginfo */
        model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
        model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
    }

    lineiter_free(li);
    fclose_comp(fp, is_pipe);
    return base;
}
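/*
 * A minimal usage sketch (not part of this file; the file name, words
 * and log base are assumptions for illustration).  Applications
 * normally reach this reader through the generic ngram_model_read()
 * entry point rather than calling ngram_model_arpa_read() directly:
 *
 *     logmath_t *lmath = logmath_init(1.0001, 0, 0);
 *     ngram_model_t *lm =
 *         ngram_model_read(NULL, "model.arpa", NGRAM_ARPA, lmath);
 *     if (lm) {
 *         // P(world | hello), assuming both words are in the model
 *         int32 score = ngram_score(lm, "world", "hello", NULL);
 *         ngram_model_free(lm);
 *     }
 *     logmath_free(lmath);
 */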
int
ngram_model_arpa_write(ngram_model_t *model, const char *file_name)
{
    ngram_iter_t *itor;
    FILE *fh;
    int i;

    if ((fh = fopen(file_name, "w")) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
        return -1;
    }
    fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");
    /* The ARPA format doesn't require any extra information that
     * N-Gram iterators can't give us, so this is very
     * straightforward compared with DMP writing. */

    /* Write N-gram counts. */
    fprintf(fh, "\\data\\\n");
    for (i = 0; i < model->n; ++i) {
        fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
    }

    /* Write N-grams */
    for (i = 0; i < model->n; ++i) {
        fprintf(fh, "\n\\%d-grams:\n", i + 1);
        for (itor = ngram_model_mgrams(model, i); itor;
             itor = ngram_iter_next(itor)) {
            int32 const *wids;
            int32 score, bowt;
            int j;

            wids = ngram_iter_get(itor, &score, &bowt);
            fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
            for (j = 0; j <= i; ++j) {
                assert(wids[j] < model->n_counts[0]);
                fprintf(fh, "%s ", model->word_str[wids[j]]);
            }
            if (i < model->n-1)
                fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
            fprintf(fh, "\n");
        }
    }
    fprintf(fh, "\n\\end\\\n");
    return fclose(fh);
}

static int
ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
                               float32 wip, float32 uw)
{
    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
    lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
    return 0;
}

/* Lousy "templating" for things that are largely the same in DMP and
 * ARPA models, except for the bigram and trigram types and some
 * names. */
#define NGRAM_MODEL_TYPE ngram_model_arpa_t
#include "lm3g_templates.c"

static void
ngram_model_arpa_free(ngram_model_t *base)
{
    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
    ckd_free(model->lm3g.unigrams);
    ckd_free(model->lm3g.bigrams);
    ckd_free(model->lm3g.trigrams);
    ckd_free(model->lm3g.prob2);
    ckd_free(model->lm3g.bo_wt2);
    ckd_free(model->lm3g.prob3);
    lm3g_tginfo_free(base, &model->lm3g);
    ckd_free(model->lm3g.tseg_base);
}

static ngram_funcs_t ngram_model_arpa_funcs = {
    ngram_model_arpa_free,          /* free */
    ngram_model_arpa_apply_weights, /* apply_weights */
    lm3g_template_score,            /* score */
    lm3g_template_raw_score,        /* raw_score */
    lm3g_template_add_ug,           /* add_ug */
    lm3g_template_flush,            /* flush */
    lm3g_template_iter,             /* iter */
    lm3g_template_mgrams,           /* mgrams */
    lm3g_template_successors,       /* successors */
    lm3g_template_iter_get,         /* iter_get */
    lm3g_template_iter_next,        /* iter_next */
    lm3g_template_iter_free         /* iter_free */
};