diff options
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/util/huff_code.c')
-rw-r--r-- | media/sphinxbase/src/libsphinxbase/util/huff_code.c | 651 |
1 files changed, 651 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/util/huff_code.c b/media/sphinxbase/src/libsphinxbase/util/huff_code.c new file mode 100644 index 000000000..dd3fb582d --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/util/huff_code.c @@ -0,0 +1,651 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2009 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +#include <string.h> + +#include "sphinxbase/huff_code.h" +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/hash_table.h" +#include "sphinxbase/byteorder.h" +#include "sphinxbase/heap.h" +#include "sphinxbase/pio.h" +#include "sphinxbase/err.h" + +typedef struct huff_node_s { + int nbits; + struct huff_node_s *l; + union { + int32 ival; + char *sval; + struct huff_node_s *r; + } r; +} huff_node_t; + +typedef struct huff_codeword_s { + union { + int32 ival; + char *sval; + } r; + uint32 nbits, codeword; +} huff_codeword_t; + +enum { + HUFF_CODE_INT, + HUFF_CODE_STR +}; + +struct huff_code_s { + int16 refcount; + uint8 maxbits; + uint8 type; + uint32 *firstcode; + uint32 *numl; + huff_codeword_t **syms; + hash_table_t *codewords; + FILE *fh; + bit_encode_t *be; + int boff; +}; + +static huff_node_t * +huff_node_new_int(int32 val) +{ + huff_node_t *hn = ckd_calloc(1, sizeof(*hn)); + hn->r.ival = val; + return hn; +} + +static huff_node_t * +huff_node_new_str(char const *val) +{ + huff_node_t *hn = ckd_calloc(1, sizeof(*hn)); + hn->r.sval = ckd_salloc(val); + return hn; +} + +static huff_node_t * +huff_node_new_parent(huff_node_t *l, huff_node_t *r) +{ + huff_node_t *hn = ckd_calloc(1, sizeof(*hn)); + hn->l = l; + hn->r.r = r; + /* Propagate maximum bit length. */ + if (r->nbits > l->nbits) + hn->nbits = r->nbits + 1; + else + hn->nbits = l->nbits + 1; + return hn; +} + +static void +huff_node_free_int(huff_node_t *root) +{ + if (root->l) { + huff_node_free_int(root->l); + huff_node_free_int(root->r.r); + } + ckd_free(root); +} + +static void +huff_node_free_str(huff_node_t *root, int freestr) +{ + if (root->l) { + huff_node_free_str(root->l, freestr); + huff_node_free_str(root->r.r, freestr); + } + else { + if (freestr) + ckd_free(root->r.sval); + } + ckd_free(root); +} + +static huff_node_t * +huff_code_build_tree(heap_t *q) +{ + huff_node_t *root = NULL; + int32 rf; + + while (heap_size(q) > 1) { + huff_node_t *l, *r, *p; + int32 lf, rf; + + heap_pop(q, (void *)&l, &lf); + heap_pop(q, (void *)&r, &rf); + p = huff_node_new_parent(l, r); + heap_insert(q, p, lf + rf); + } + heap_pop(q, (void **)&root, &rf); + return root; +} + +static void +huff_code_canonicalize(huff_code_t *hc, huff_node_t *root) +{ + glist_t agenda; + uint32 *nextcode; + int i, ncw; + + hc->firstcode = ckd_calloc(hc->maxbits+1, sizeof(*hc->firstcode)); + hc->syms = ckd_calloc(hc->maxbits+1, sizeof(*hc->syms)); + hc->numl = ckd_calloc(hc->maxbits+1, sizeof(*nextcode)); + nextcode = ckd_calloc(hc->maxbits+1, sizeof(*nextcode)); + + /* Traverse the tree, annotating it with the actual bit + * lengths, and histogramming them in numl. */ + root->nbits = 0; + ncw = 0; + agenda = glist_add_ptr(NULL, root); + while (agenda) { + huff_node_t *node = gnode_ptr(agenda); + agenda = gnode_free(agenda, NULL); + if (node->l) { + node->l->nbits = node->nbits + 1; + agenda = glist_add_ptr(agenda, node->l); + node->r.r->nbits = node->nbits + 1; + agenda = glist_add_ptr(agenda, node->r.r); + } + else { + hc->numl[node->nbits]++; + ncw++; + } + } + /* Create starting codes and symbol tables for each bit length. */ + hc->syms[hc->maxbits] = ckd_calloc(hc->numl[hc->maxbits], sizeof(**hc->syms)); + for (i = hc->maxbits - 1; i > 0; --i) { + hc->firstcode[i] = (hc->firstcode[i+1] + hc->numl[i+1]) / 2; + hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms)); + } + memcpy(nextcode, hc->firstcode, (hc->maxbits + 1) * sizeof(*nextcode)); + /* Traverse the tree again to produce the codebook itself. */ + hc->codewords = hash_table_new(ncw, HASH_CASE_YES); + agenda = glist_add_ptr(NULL, root); + while (agenda) { + huff_node_t *node = gnode_ptr(agenda); + agenda = gnode_free(agenda, NULL); + if (node->l) { + agenda = glist_add_ptr(agenda, node->l); + agenda = glist_add_ptr(agenda, node->r.r); + } + else { + /* Initialize codebook entry, which also retains symbol pointer. */ + huff_codeword_t *cw; + uint32 codeword = nextcode[node->nbits] & ((1 << node->nbits) - 1); + cw = hc->syms[node->nbits] + (codeword - hc->firstcode[node->nbits]); + cw->nbits = node->nbits; + cw->r.sval = node->r.sval; /* Will copy ints too... */ + cw->codeword = codeword; + if (hc->type == HUFF_CODE_INT) { + hash_table_enter_bkey(hc->codewords, + (char const *)&cw->r.ival, + sizeof(cw->r.ival), + (void *)cw); + } + else { + hash_table_enter(hc->codewords, cw->r.sval, (void *)cw); + } + ++nextcode[node->nbits]; + } + } + ckd_free(nextcode); +} + +huff_code_t * +huff_code_build_int(int32 const *values, int32 const *frequencies, int nvals) +{ + huff_code_t *hc; + huff_node_t *root; + heap_t *q; + int i; + + hc = ckd_calloc(1, sizeof(*hc)); + hc->refcount = 1; + hc->type = HUFF_CODE_INT; + + /* Initialize the heap with nodes for each symbol. */ + q = heap_new(); + for (i = 0; i < nvals; ++i) { + heap_insert(q, + huff_node_new_int(values[i]), + frequencies[i]); + } + + /* Now build the tree, which gives us codeword lengths. */ + root = huff_code_build_tree(q); + heap_destroy(q); + if (root == NULL || root->nbits > 32) { + E_ERROR("Huffman trees currently limited to 32 bits\n"); + huff_node_free_int(root); + huff_code_free(hc); + return NULL; + } + + /* Build a canonical codebook. */ + hc->maxbits = root->nbits; + huff_code_canonicalize(hc, root); + + /* Tree no longer needed. */ + huff_node_free_int(root); + + return hc; +} + +huff_code_t * +huff_code_build_str(char * const *values, int32 const *frequencies, int nvals) +{ + huff_code_t *hc; + huff_node_t *root; + heap_t *q; + int i; + + hc = ckd_calloc(1, sizeof(*hc)); + hc->refcount = 1; + hc->type = HUFF_CODE_STR; + + /* Initialize the heap with nodes for each symbol. */ + q = heap_new(); + for (i = 0; i < nvals; ++i) { + heap_insert(q, + huff_node_new_str(values[i]), + frequencies[i]); + } + + /* Now build the tree, which gives us codeword lengths. */ + root = huff_code_build_tree(q); + heap_destroy(q); + if (root == NULL || root->nbits > 32) { + E_ERROR("Huffman trees currently limited to 32 bits\n"); + huff_node_free_str(root, TRUE); + huff_code_free(hc); + return NULL; + } + + /* Build a canonical codebook. */ + hc->maxbits = root->nbits; + huff_code_canonicalize(hc, root); + + /* Tree no longer needed (note we retain pointers to its strings). */ + huff_node_free_str(root, FALSE); + + return hc; +} + +huff_code_t * +huff_code_read(FILE *infh) +{ + huff_code_t *hc; + int i, j; + + hc = ckd_calloc(1, sizeof(*hc)); + hc->refcount = 1; + + hc->maxbits = fgetc(infh); + hc->type = fgetc(infh); + + /* Two bytes of padding. */ + fgetc(infh); + fgetc(infh); + + /* Allocate stuff. */ + hc->firstcode = ckd_calloc(hc->maxbits + 1, sizeof(*hc->firstcode)); + hc->numl = ckd_calloc(hc->maxbits + 1, sizeof(*hc->numl)); + hc->syms = ckd_calloc(hc->maxbits + 1, sizeof(*hc->syms)); + + /* Read the symbol tables. */ + hc->codewords = hash_table_new(hc->maxbits, HASH_CASE_YES); + for (i = 1; i <= hc->maxbits; ++i) { + if (fread(&hc->firstcode[i], 4, 1, infh) != 1) + goto error_out; + SWAP_BE_32(&hc->firstcode[i]); + if (fread(&hc->numl[i], 4, 1, infh) != 1) + goto error_out; + SWAP_BE_32(&hc->numl[i]); + hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms)); + for (j = 0; j < hc->numl[i]; ++j) { + huff_codeword_t *cw = &hc->syms[i][j]; + cw->nbits = i; + cw->codeword = hc->firstcode[i] + j; + if (hc->type == HUFF_CODE_INT) { + if (fread(&cw->r.ival, 4, 1, infh) != 1) + goto error_out; + SWAP_BE_32(&cw->r.ival); + hash_table_enter_bkey(hc->codewords, + (char const *)&cw->r.ival, + sizeof(cw->r.ival), + (void *)cw); + } + else { + size_t len; + cw->r.sval = fread_line(infh, &len); + cw->r.sval[len-1] = '\0'; + hash_table_enter(hc->codewords, cw->r.sval, (void *)cw); + } + } + } + + return hc; +error_out: + huff_code_free(hc); + return NULL; +} + +int +huff_code_write(huff_code_t *hc, FILE *outfh) +{ + int i, j; + + /* Maximum codeword length */ + fputc(hc->maxbits, outfh); + /* Symbol type */ + fputc(hc->type, outfh); + /* Two extra bytes (for future use and alignment) */ + fputc(0, outfh); + fputc(0, outfh); + /* For each codeword length: */ + for (i = 1; i <= hc->maxbits; ++i) { + uint32 val; + + /* Starting code, number of codes. */ + val = hc->firstcode[i]; + /* Canonically big-endian (like the data itself) */ + SWAP_BE_32(&val); + fwrite(&val, 4, 1, outfh); + val = hc->numl[i]; + SWAP_BE_32(&val); + fwrite(&val, 4, 1, outfh); + + /* Symbols for each code (FIXME: Should compress these too) */ + for (j = 0; j < hc->numl[i]; ++j) { + if (hc->type == HUFF_CODE_INT) { + int32 val = hc->syms[i][j].r.ival; + SWAP_BE_32(&val); + fwrite(&val, 4, 1, outfh); + } + else { + /* Write them all separated by newlines, so that + * fgets() will read them for us. */ + fprintf(outfh, "%s\n", hc->syms[i][j].r.sval); + } + } + } + return 0; +} + +int +huff_code_dump_codebits(FILE *dumpfh, uint32 nbits, uint32 codeword) +{ + uint32 i; + + for (i = 0; i < nbits; ++i) + fputc((codeword & (1<<(nbits-i-1))) ? '1' : '0', dumpfh); + return 0; +} + +int +huff_code_dump(huff_code_t *hc, FILE *dumpfh) +{ + int i, j; + + /* Print out all codewords. */ + fprintf(dumpfh, "Maximum codeword length: %d\n", hc->maxbits); + fprintf(dumpfh, "Symbols are %s\n", (hc->type == HUFF_CODE_STR) ? "strings" : "ints"); + fprintf(dumpfh, "Codewords:\n"); + for (i = 1; i <= hc->maxbits; ++i) { + for (j = 0; j < hc->numl[i]; ++j) { + if (hc->type == HUFF_CODE_STR) + fprintf(dumpfh, "%-30s", hc->syms[i][j].r.sval); + else + fprintf(dumpfh, "%-30d", hc->syms[i][j].r.ival); + huff_code_dump_codebits(dumpfh, hc->syms[i][j].nbits, + hc->syms[i][j].codeword); + fprintf(dumpfh, "\n"); + } + } + return 0; +} + +huff_code_t * +huff_code_retain(huff_code_t *hc) +{ + ++hc->refcount; + return hc; +} + +int +huff_code_free(huff_code_t *hc) +{ + int i; + + if (hc == NULL) + return 0; + if (--hc->refcount > 0) + return hc->refcount; + for (i = 0; i <= hc->maxbits; ++i) { + int j; + for (j = 0; j < hc->numl[i]; ++j) { + if (hc->type == HUFF_CODE_STR) + ckd_free(hc->syms[i][j].r.sval); + } + ckd_free(hc->syms[i]); + } + ckd_free(hc->firstcode); + ckd_free(hc->numl); + ckd_free(hc->syms); + hash_table_free(hc->codewords); + ckd_free(hc); + return 0; +} + +FILE * +huff_code_attach(huff_code_t *hc, FILE *fh, char const *mode) +{ + FILE *oldfh = huff_code_detach(hc); + + hc->fh = fh; + if (mode[0] == 'w') + hc->be = bit_encode_attach(hc->fh); + return oldfh; +} + +FILE * +huff_code_detach(huff_code_t *hc) +{ + FILE *oldfh = hc->fh; + + if (hc->be) { + bit_encode_flush(hc->be); + bit_encode_free(hc->be); + hc->be = NULL; + } + hc->fh = NULL; + return oldfh; +} + +int +huff_code_encode_int(huff_code_t *hc, int32 sym, uint32 *outcw) +{ + huff_codeword_t *cw; + + if (hash_table_lookup_bkey(hc->codewords, + (char const *)&sym, + sizeof(sym), + (void **)&cw) < 0) + return 0; + if (hc->be) + bit_encode_write_cw(hc->be, cw->codeword, cw->nbits); + if (outcw) *outcw = cw->codeword; + return cw->nbits; +} + +int +huff_code_encode_str(huff_code_t *hc, char const *sym, uint32 *outcw) +{ + huff_codeword_t *cw; + + if (hash_table_lookup(hc->codewords, + sym, + (void **)&cw) < 0) + return 0; + if (hc->be) + bit_encode_write_cw(hc->be, cw->codeword, cw->nbits); + if (outcw) *outcw = cw->codeword; + return cw->nbits; +} + +static huff_codeword_t * +huff_code_decode_data(huff_code_t *hc, char const **inout_data, + size_t *inout_data_len, int *inout_offset) +{ + char const *data = *inout_data; + char const *end = data + *inout_data_len; + int offset = *inout_offset; + uint32 cw; + int cwlen; + int byte; + + if (data == end) + return NULL; + byte = *data++; + cw = !!(byte & (1 << (7-offset++))); + cwlen = 1; + /* printf("%.*x ", cwlen, cw); */ + while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) { + ++cwlen; + cw <<= 1; + if (offset > 7) { + if (data == end) + return NULL; + byte = *data++; + offset = 0; + } + cw |= !!(byte & (1 << (7-offset++))); + /* printf("%.*x ", cwlen, cw); */ + } + if (cwlen > hc->maxbits) /* FAIL: invalid data */ + return NULL; + + /* Put the last byte back if there are bits left over. */ + if (offset < 8) + --data; + else + offset = 0; + + /* printf("%.*x\n", cwlen, cw); */ + *inout_data_len = end - data; + *inout_data = data; + *inout_offset = offset; + return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]); +} + +static huff_codeword_t * +huff_code_decode_fh(huff_code_t *hc) +{ + uint32 cw; + int cwlen; + int byte; + + if ((byte = fgetc(hc->fh)) == EOF) + return NULL; + cw = !!(byte & (1 << (7-hc->boff++))); + cwlen = 1; + /* printf("%.*x ", cwlen, cw); */ + while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) { + ++cwlen; + cw <<= 1; + if (hc->boff > 7) { + if ((byte = fgetc(hc->fh)) == EOF) + return NULL; + hc->boff = 0; + } + cw |= !!(byte & (1 << (7-hc->boff++))); + /* printf("%.*x ", cwlen, cw); */ + } + if (cwlen > hc->maxbits) /* FAIL: invalid data */ + return NULL; + + /* Put the last byte back if there are bits left over. */ + if (hc->boff < 8) + ungetc(byte, hc->fh); + else + hc->boff = 0; + + /* printf("%.*x\n", cwlen, cw); */ + return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]); +} + +int +huff_code_decode_int(huff_code_t *hc, int *outval, + char const **inout_data, + size_t *inout_data_len, int *inout_offset) +{ + huff_codeword_t *cw; + + if (inout_data) + cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset); + else if (hc->fh) + cw = huff_code_decode_fh(hc); + else + return -1; + + if (cw == NULL) + return -1; + if (outval) + *outval = cw->r.ival; + + return 0; +} + +char const * +huff_code_decode_str(huff_code_t *hc, + char const **inout_data, + size_t *inout_data_len, int *inout_offset) +{ + huff_codeword_t *cw; + + if (inout_data) + cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset); + else if (hc->fh) + cw = huff_code_decode_fh(hc); + else + return NULL; + + if (cw == NULL) + return NULL; + + return cw->r.sval; +} |