1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
|
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2013 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/*
* fsg_lextree.h -- The collection of all the lextrees for the entire FSM.
*
*/
#ifndef __S2_FSG_LEXTREE_H__
#define __S2_FSG_LEXTREE_H__
/* SphinxBase headers. */
#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/fsg_model.h>
/* Local headers. */
#include "hmm.h"
#include "dict.h"
#include "dict2pid.h"
/*
 * Compile-time constant determining the size, in uint32 words, of the
 * bitvector fsg_pnode_ctxt_t.bv (embedded in fsg_pnode_t as the ctxt
 * field; see below).  Having it fixed at compile time makes memory
 * allocation simpler and more efficient.
 * fsg_pnode_add_ctxt() stores context phone c at bit (c & 0x1f) of word
 * (c >> 5), so each word covers 32 phone ids and the value 4 covers ids
 * 0..127.  Make it smaller (2) to save memory if your phoneset has fewer
 * than 64 phones.
 */
#define FSG_PNODE_CTXT_BVSZ 4
/* Bit-vector set of context CI phones; one bit per phone id. */
typedef struct {
uint32 bv[FSG_PNODE_CTXT_BVSZ];
} fsg_pnode_ctxt_t;
/*
* All transitions (words) out of any given FSG state are represented by a
* phonetic prefix lextree (except for epsilon or null transitions; they
* are not part of the lextree). Lextree leaf nodes represent individual
* FSG transitions, so no sharing is allowed at the leaf nodes. The FSG
* transition probs are distributed along the lextree: the prob at a node
* is the max of the probs of all leaf nodes (and, hence, FSG transitions)
* reachable from that node.
*
* To conserve memory, the underlying HMMs with state-level information are
* allocated only as needed. Root and leaf nodes must also account for all
* the possible phonetic contexts, with an independent HMM for each distinct
* context.
*/
typedef struct fsg_pnode_s {
/*
 * If this is not a leaf node, the first successor (child) node. Otherwise
 * the parent FSG transition for which this is the leaf node (for figuring
 * the FSG destination state, and word emitted by the transition). A node
 * may have several children. The succ ptr gives just the first; the rest
 * are linked via the sibling ptr below.
 * Which member is meaningful is decided by the leaf flag further down:
 * next.succ for non-leaf nodes, next.fsglink for leaf nodes.
 */
union {
struct fsg_pnode_s *succ;
fsg_link_t *fsglink;
} next;
/*
 * For simplicity of memory management (i.e., freeing the pnodes), all
 * pnodes allocated for all transitions out of a state are maintained in a
 * linear linked list through the alloc_next pointer.
 */
struct fsg_pnode_s *alloc_next;
/*
 * The next node that is also a child of the parent of this node; NULL if
 * none.
 */
struct fsg_pnode_s *sibling;
/*
 * The transition (log) probability to be incurred upon transitioning to
 * this node. (Transition probabilities are really associated with the
 * transitions. But a lextree node has exactly one incoming transition.
 * Hence, the prob can be associated with the node.)
 * This is a logs2(prob) value, and includes the language weight.
 */
int32 logs2prob;
/*
 * The root and leaf positions associated with any transition have to deal
 * with multiple phonetic contexts. However, different contexts may result
 * in the same SSID (senone-seq ID), and can share a single pnode with that
 * SSID. But the pnode should track the set of context CI phones that share
 * it. Hence the fsg_pnode_ctxt_t bit-vector set-representation. (For
 * simplicity of implementation, its size is a compile-time constant for
 * now.) Single phone words would need a 2-D array of context, but that's
 * too expensive. For now, they simply use SIL as right context, so only
 * the left context is properly modelled.
 * (For word-internal phones, this field is unused, of course.)
 * Context phone c is recorded as bit (c & 0x1f) of ctxt.bv[c >> 5]; see
 * fsg_pnode_add_ctxt().
 */
fsg_pnode_ctxt_t ctxt;
uint16 ci_ext; /* This node's CIphone as viewed externally (context) */
uint8 ppos; /* Phoneme position in pronunciation */
uint8 leaf; /* Whether this is a leaf node; selects the valid member of next */
/* HMM-state-level stuff here */
/* NOTE(review): presumably the same HMM context as fsg_lextree_t.ctx -- confirm. */
hmm_context_t *ctx;
/* HMM embedded by value in the node; fsg_pnode_hmmptr() yields its address. */
hmm_t hmm;
} fsg_pnode_t;
/*
 * Access macros.  Each takes a pointer p to an fsg_pnode_t and expands to
 * the named field (an lvalue), except fsg_pnode_hmmptr(), which expands to
 * the address of the embedded hmm.
 */
#define fsg_pnode_leaf(p)       ((p)->leaf)
#define fsg_pnode_logs2prob(p)  ((p)->logs2prob)
#define fsg_pnode_succ(p)       ((p)->next.succ)
#define fsg_pnode_fsglink(p)    ((p)->next.fsglink)
#define fsg_pnode_sibling(p)    ((p)->sibling)
#define fsg_pnode_hmmptr(p)     (&((p)->hmm))
#define fsg_pnode_ci_ext(p)     ((p)->ci_ext)
#define fsg_pnode_ppos(p)       ((p)->ppos)
#define fsg_pnode_ctxt(p)       ((p)->ctxt)
/*
 * Add context phone c to the context set of node p: sets bit (c & 0x1f) of
 * word ctxt.bv[c >> 5].  The macro's value is the updated word.
 *
 * The shifted constant is unsigned on purpose: with a plain int 1, selecting
 * bit 31 would shift into the sign bit, which is undefined behaviour.
 *
 * c is evaluated twice, so it must not have side effects.  No bounds check is
 * done; the caller must ensure 0 <= c < 32 * FSG_PNODE_CTXT_BVSZ.
 */
#define fsg_pnode_add_ctxt(p,c) ((p)->ctxt.bv[(c)>>5] |= (1U << ((c)&0x001f)))
/*
* The following is macroized because it is called very frequently
* ::: uint32 fsg_pnode_ctxt_sub (fsg_pnode_ctxt_t *src, fsg_pnode_ctxt_t *sub);
*/
/*
* Subtract bitvector sub from bitvector src (src updated with the result).
* Return 0 if result is all 0, non-zero otherwise.
*/
/*
 * Unrolled forms for the bitvector sizes 1, 2 and 4; any other size is handed
 * to the out-of-line fsg_pnode_ctxt_sub_generic().  In the unrolled forms each
 * word of src has the bits of the matching word of sub cleared, and the
 * updated words are OR-ed together to give the value of the expression.
 */
#if (FSG_PNODE_CTXT_BVSZ == 1)
#define FSG_PNODE_CTXT_SUB(src,sub) \
    ((src)->bv[0] &= ~((sub)->bv[0]))
#elif (FSG_PNODE_CTXT_BVSZ == 2)
#define FSG_PNODE_CTXT_SUB(src,sub) \
    (((src)->bv[0] &= ~((sub)->bv[0])) | \
     ((src)->bv[1] &= ~((sub)->bv[1])))
#elif (FSG_PNODE_CTXT_BVSZ == 4)
#define FSG_PNODE_CTXT_SUB(src,sub) \
    (((src)->bv[0] &= ~((sub)->bv[0])) | \
     ((src)->bv[1] &= ~((sub)->bv[1])) | \
     ((src)->bv[2] &= ~((sub)->bv[2])) | \
     ((src)->bv[3] &= ~((sub)->bv[3])))
#else
#define FSG_PNODE_CTXT_SUB(src,sub) fsg_pnode_ctxt_sub_generic((src),(sub))
#endif
/**
 * Collection of lextrees for an FSG.
 *
 * Holds one lextree root and one allocation list per FSG state (see root and
 * alloc_head), together with the models the trees were built from.
 */
typedef struct fsg_lextree_s {
fsg_model_t *fsg; /**< The fsg for which this lextree is built. */
hmm_context_t *ctx; /**< HMM context structure. */
dict_t *dict; /**< Pronunciation dictionary for this FSG. */
dict2pid_t *d2p; /**< Context-dependent phone mappings for this FSG. */
bin_mdef_t *mdef; /**< Model definition (triphone mappings). */
/*
 * Left and right CIphone sets for each state.
 * Left context CIphones for a state S: If word W transitions into S, W's
 * final CIphone is in S's {lc}. Words transitioning out of S must consider
 * these left context CIphones.
 * Similarly, right contexts for state S: If word W transitions out of S,
 * W's first CIphone is in S's {rc}. Words transitioning into S must consider
 * these right contexts.
 *
 * NOTE: Words may transition into and out of S INDIRECTLY, with intermediate
 * null transitions.
 * NOTE: Single-phone words are difficult; only SILENCE right context is
 * modelled for them.
 * NOTE: Non-silence filler phones aren't included in these sets. Filler
 * words don't use context, and present the SILENCE phone as context to
 * adjacent words.
 *
 * NOTE(review): the text above describes per-state CIphone sets, while the
 * trailing field comments say "triphone mappings"; presumably lc[s] / rc[s]
 * is the phone list for state s -- confirm against the implementation.
 */
int16 **lc; /**< Left context triphone mappings for FSG. */
int16 **rc; /**< Right context triphone mappings for FSG. */
fsg_pnode_t **root; /* root[s] = lextree representing all transitions
out of state s. Note that the "tree" for each
state is actually a collection of trees, linked
via fsg_pnode_t.sibling (root[s]->sibling) */
fsg_pnode_t **alloc_head; /* alloc_head[s] = head of linear list of all
pnodes allocated for state s (chained through
fsg_pnode_t.alloc_next) */
int32 n_pnode; /* #HMM nodes in search structure */
int32 wip; /* NOTE(review): presumably the word insertion penalty passed to
fsg_lextree_init() -- confirm */
int32 pip; /* NOTE(review): presumably the phone insertion penalty passed to
fsg_lextree_init() -- confirm */
} fsg_lextree_t;
/* Field accessors for fsg_lextree_t; lt is a pointer to the lextree. */
/* root[s]: lextree for the transitions out of FSG state s. */
#define fsg_lextree_root(lt,s)   ((lt)->root[(s)])
/* Number of HMM nodes in the search structure. */
#define fsg_lextree_n_pnode(lt)  ((lt)->n_pnode)
/**
 * Create, initialize, and return a new phonetic lextree for the given FSG.
 *
 * The parameters share their names with fields of fsg_lextree_t.
 * NOTE(review): presumably each is stored in the field of the same name;
 * the implementation is not in this header -- confirm.
 *
 * @param fsg  The FSG for which the lextree is built.
 * @param dict Pronunciation dictionary for this FSG.
 * @param d2p  Context-dependent phone mappings for this FSG.
 * @param mdef Model definition (triphone mappings).
 * @param ctx  HMM context structure.
 * @param wip  NOTE(review): presumably the word insertion penalty -- confirm.
 * @param pip  NOTE(review): presumably the phone insertion penalty -- confirm.
 * @return The new lextree collection.  NOTE(review): failure behaviour and
 *         ownership of the arguments are not visible here -- confirm.
 */
fsg_lextree_t *fsg_lextree_init(fsg_model_t *fsg, dict_t *dict,
dict2pid_t *d2p,
bin_mdef_t *mdef, hmm_context_t *ctx,
int32 wip, int32 pip);
/**
 * Free lextrees for an FSG.
 *
 * @param fsg The lextree collection to free.  Despite its name, this is an
 *            fsg_lextree_t, not an fsg_model_t.
 *            NOTE(review): whether NULL is accepted, and whether the
 *            referenced fsg/dict/d2p/mdef/ctx objects are released too, is
 *            not visible here -- confirm.
 */
void fsg_lextree_free(fsg_lextree_t *fsg);
/**
 * Print an FSG lextree to a file for debugging.
 *
 * @param fsg The lextree collection to print (an fsg_lextree_t, despite the
 *            parameter name).
 * @param fh  Stream the dump is written to.
 */
void fsg_lextree_dump(fsg_lextree_t *fsg, FILE *fh);
/**
 * Mark the given pnode as inactive (for search).
 *
 * @param pnode The lextree node to deactivate.
 *              NOTE(review): presumably this resets the node's embedded HMM
 *              state -- confirm against the implementation.
 */
void fsg_psubtree_pnode_deactivate(fsg_pnode_t *pnode);
/**
 * Set all flags on in the given context bitvector.
 *
 * @param ctxt Context bitvector to modify in place.
 *             NOTE(review): presumably every bit of all FSG_PNODE_CTXT_BVSZ
 *             words is set -- confirm.
 */
void fsg_pnode_add_all_ctxt(fsg_pnode_ctxt_t *ctxt);
/**
 * Generic variant for arbitrary size
 *
 * Out-of-line fallback behind FSG_PNODE_CTXT_SUB(), which expands to a call
 * of this function when FSG_PNODE_CTXT_BVSZ is not 1, 2 or 4.
 * NOTE(review): presumably it follows the same contract as the unrolled
 * macros -- subtract sub from src in place and return 0 if the result is
 * all 0, non-zero otherwise -- confirm against the implementation.
 *
 * @param src Bitvector to subtract from.
 * @param sub Bitvector whose set bits are removed from src.
 */
uint32 fsg_pnode_ctxt_sub_generic(fsg_pnode_ctxt_t *src, fsg_pnode_ctxt_t *sub);
#endif
|