summaryrefslogtreecommitdiffstats
path: root/intl/hyphenation/hyphen/hyphen.c
diff options
context:
space:
mode:
Diffstat (limited to 'intl/hyphenation/hyphen/hyphen.c')
-rw-r--r--intl/hyphenation/hyphen/hyphen.c1187
1 files changed, 1187 insertions, 0 deletions
diff --git a/intl/hyphenation/hyphen/hyphen.c b/intl/hyphenation/hyphen/hyphen.c
new file mode 100644
index 000000000..9a132d026
--- /dev/null
+++ b/intl/hyphenation/hyphen/hyphen.c
@@ -0,0 +1,1187 @@
+/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
+ * licenses follows.
+ */
+
+/* LibHnj - a library for high quality hyphenation and justification
+ * Copyright (C) 1998 Raph Levien,
+ * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
+ * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
+ * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307 USA.
+*/
+
+/*
+ * The contents of this file are subject to the Mozilla Public License
+ * Version 1.0 (the "MPL"); you may not use this file except in
+ * compliance with the MPL. You may obtain a copy of the MPL at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the MPL is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
+ * for the specific language governing rights and limitations under the
+ * MPL.
+ *
+ */
+#include <stdlib.h> /* for NULL, malloc */
+#include <stdio.h> /* for fprintf */
+#include <string.h> /* for strdup */
+#include <limits.h> /* for INT_MAX */
+
+#ifdef UNX
+#include <unistd.h> /* for exit */
+#endif
+
+#define noVERBOSE
+
+/* calculate hyphenmin values with long ligature length (2 or 3 characters
+ * instead of 1 or 2) for comparison with hyphenation without ligatures */
+#define noLONG_LIGATURE
+
+#ifdef LONG_LIGATURE
+#define LIG_xx 1
+#define LIG_xxx 2
+#else
+#define LIG_xx 0
+#define LIG_xxx 1
+#endif
+
+#include "hnjalloc.h"
+#include "hyphen.h"
+
+static char *
+hnj_strdup (const char *s)
+{
+ char *newstr;
+ int l;
+
+ l = strlen (s);
+ newstr = (char *) hnj_malloc (l + 1);
+ memcpy (newstr, s, l);
+ newstr[l] = 0;
+ return newstr;
+}
+
+/* remove cross-platform text line end characters */
+void hnj_strchomp(char * s)
+{
+ int k = strlen(s);
+ if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
+ if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
+}
+
+/* a little bit of a hash table implementation. This simply maps strings
+ to state numbers */
+
+typedef struct _HashTab HashTab;
+typedef struct _HashEntry HashEntry;
+
+/* A cheap, but effective, hack. */
+#define HASH_SIZE 31627
+
+struct _HashTab {
+ HashEntry *entries[HASH_SIZE];
+};
+
+struct _HashEntry {
+ HashEntry *next;
+ char *key;
+ int val;
+};
+
+/* a char* hash function from ASU - adapted from Gtk+ */
+static unsigned int
+hnj_string_hash (const char *s)
+{
+ const char *p;
+ unsigned int h=0, g;
+ for(p = s; *p != '\0'; p += 1) {
+ h = ( h << 4 ) + *p;
+ if ( ( g = h & 0xf0000000 ) ) {
+ h = h ^ (g >> 24);
+ h = h ^ g;
+ }
+ }
+ return h /* % M */;
+}
+
+static HashTab *
+hnj_hash_new (void)
+{
+ HashTab *hashtab;
+ int i;
+
+ hashtab = (HashTab *) hnj_malloc (sizeof(HashTab));
+ for (i = 0; i < HASH_SIZE; i++)
+ hashtab->entries[i] = NULL;
+
+ return hashtab;
+}
+
+static void
+hnj_hash_free (HashTab *hashtab)
+{
+ int i;
+ HashEntry *e, *next;
+
+ for (i = 0; i < HASH_SIZE; i++)
+ for (e = hashtab->entries[i]; e; e = next)
+ {
+ next = e->next;
+ hnj_free (e->key);
+ hnj_free (e);
+ }
+
+ hnj_free (hashtab);
+}
+
+/* assumes that key is not already present! */
+static void
+hnj_hash_insert (HashTab *hashtab, const char *key, int val)
+{
+ int i;
+ HashEntry *e;
+
+ i = hnj_string_hash (key) % HASH_SIZE;
+ e = (HashEntry *) hnj_malloc (sizeof(HashEntry));
+ e->next = hashtab->entries[i];
+ e->key = hnj_strdup (key);
+ e->val = val;
+ hashtab->entries[i] = e;
+}
+
+/* return val if found, otherwise -1 */
+static int
+hnj_hash_lookup (HashTab *hashtab, const char *key)
+{
+ int i;
+ HashEntry *e;
+ i = hnj_string_hash (key) % HASH_SIZE;
+ for (e = hashtab->entries[i]; e; e = e->next)
+ if (!strcmp (key, e->key))
+ return e->val;
+ return -1;
+}
+
+/* Get the state number, allocating a new state if necessary. */
+static int
+hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
+{
+ int state_num;
+
+ state_num = hnj_hash_lookup (hashtab, string);
+
+ if (state_num >= 0)
+ return state_num;
+
+ hnj_hash_insert (hashtab, string, dict->num_states);
+ /* predicate is true if dict->num_states is a power of two */
+ if (!(dict->num_states & (dict->num_states - 1)))
+ {
+ dict->states = (HyphenState *) hnj_realloc (dict->states,
+ (dict->num_states << 1) *
+ sizeof(HyphenState));
+ }
+ dict->states[dict->num_states].match = NULL;
+ dict->states[dict->num_states].repl = NULL;
+ dict->states[dict->num_states].fallback_state = -1;
+ dict->states[dict->num_states].num_trans = 0;
+ dict->states[dict->num_states].trans = NULL;
+ return dict->num_states++;
+}
+
+/* add a transition from state1 to state2 through ch - assumes that the
+ transition does not already exist */
+static void
+hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
+{
+ int num_trans;
+
+ num_trans = dict->states[state1].num_trans;
+ if (num_trans == 0)
+ {
+ dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans));
+ }
+ else if (!(num_trans & (num_trans - 1)))
+ {
+ dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans,
+ (num_trans << 1) *
+ sizeof(HyphenTrans));
+ }
+ dict->states[state1].trans[num_trans].ch = ch;
+ dict->states[state1].trans[num_trans].new_state = state2;
+ dict->states[state1].num_trans++;
+}
+
+#ifdef VERBOSE
+HashTab *global[1];
+
+static char *
+get_state_str (int state, int level)
+{
+ int i;
+ HashEntry *e;
+
+ for (i = 0; i < HASH_SIZE; i++)
+ for (e = global[level]->entries[i]; e; e = e->next)
+ if (e->val == state)
+ return e->key;
+ return NULL;
+}
+#endif
+
+void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
+ int i, j;
+ char word[MAX_CHARS];
+ char pattern[MAX_CHARS];
+ char * repl;
+ signed char replindex;
+ signed char replcut;
+ int state_num = 0;
+ int last_state;
+ char ch;
+ int found;
+
+ if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
+ dict->lhmin = atoi(buf + 13);
+ return;
+ } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
+ dict->rhmin = atoi(buf + 14);
+ return;
+ } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
+ dict->clhmin = atoi(buf + 21);
+ return;
+ } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
+ dict->crhmin = atoi(buf + 22);
+ return;
+ } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
+ char * space = buf + 8;
+ while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
+ if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
+ if (dict->nohyphen) {
+ char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
+ *nhe = 0;
+ for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
+ if (*nhe == ',') {
+ dict->nohyphenl++;
+ *nhe = 0;
+ }
+ }
+ }
+ return;
+ }
+ j = 0;
+ pattern[j] = '0';
+ repl = strchr(buf, '/');
+ replindex = 0;
+ replcut = 0;
+ if (repl) {
+ char * index = strchr(repl + 1, ',');
+ *repl = '\0';
+ if (index) {
+ char * index2 = strchr(index + 1, ',');
+ *index = '\0';
+ if (index2) {
+ *index2 = '\0';
+ replindex = (signed char) atoi(index + 1) - 1;
+ replcut = (signed char) atoi(index2 + 1);
+ }
+ } else {
+ hnj_strchomp(repl + 1);
+ replindex = 0;
+ replcut = (signed char) strlen(buf);
+ }
+ repl = hnj_strdup(repl + 1);
+ }
+ for (i = 0; (unsigned char)buf[i] > (unsigned char)' '; i++)
+ {
+ if (buf[i] >= '0' && buf[i] <= '9')
+ pattern[j] = buf[i];
+ else
+ {
+ word[j] = buf[i];
+ pattern[++j] = '0';
+ }
+ }
+ word[j] = '\0';
+ pattern[j + 1] = '\0';
+
+ i = 0;
+ if (!repl) {
+ /* Optimize away leading zeroes */
+ for (; pattern[i] == '0'; i++);
+ } else {
+ if (*word == '.') i++;
+ /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
+ if (dict->utf8) {
+ int pu = -1; /* unicode character position */
+ int ps = -1; /* unicode start position (original replindex) */
+ size_t pc = (*word == '.') ? 1: 0; /* 8-bit character position */
+ for (; pc < (strlen(word) + 1); pc++) {
+ /* beginning of an UTF-8 character (not '10' start bits) */
+ if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
+ if ((ps < 0) && (replindex == pu)) {
+ ps = replindex;
+ replindex = (signed char) pc;
+ }
+ if ((ps >= 0) && ((pu - ps) == replcut)) {
+ replcut = (signed char) (pc - replindex);
+ break;
+ }
+ }
+ if (*word == '.') replindex--;
+ }
+ }
+
+#ifdef VERBOSE
+ printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
+#endif
+ found = hnj_hash_lookup (hashtab, word);
+ state_num = hnj_get_state (dict, hashtab, word);
+ dict->states[state_num].match = hnj_strdup (pattern + i);
+ dict->states[state_num].repl = repl;
+ dict->states[state_num].replindex = replindex;
+ if (!replcut) {
+ dict->states[state_num].replcut = (signed char) strlen(word);
+ } else {
+ dict->states[state_num].replcut = replcut;
+ }
+
+ /* now, put in the prefix transitions */
+ for (; found < 0 && j > 0; --j)
+ {
+ last_state = state_num;
+ ch = word[j - 1];
+ word[j - 1] = '\0';
+ found = hnj_hash_lookup (hashtab, word);
+ state_num = hnj_get_state (dict, hashtab, word);
+ hnj_add_trans (dict, state_num, last_state, ch);
+ }
+}
+
+HyphenDict *
+hnj_hyphen_load (const char *fn)
+{
+ HyphenDict *result;
+ FILE *f;
+ f = fopen (fn, "r");
+ if (f == NULL)
+ return NULL;
+
+ result = hnj_hyphen_load_file(f);
+
+ fclose(f);
+ return result;
+}
+
+HyphenDict *
+hnj_hyphen_load_file (FILE *f)
+{
+ HyphenDict *dict[2];
+ HashTab *hashtab;
+ char buf[MAX_CHARS];
+ int nextlevel = 0;
+ int i, j, k;
+ HashEntry *e;
+ int state_num = 0;
+/* loading one or two dictionaries (separated by NEXTLEVEL keyword) */
+for (k = 0; k < 2; k++) {
+ hashtab = hnj_hash_new ();
+#ifdef VERBOSE
+ global[k] = hashtab;
+#endif
+ hnj_hash_insert (hashtab, "", 0);
+ dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
+ dict[k]->num_states = 1;
+ dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
+ dict[k]->states[0].match = NULL;
+ dict[k]->states[0].repl = NULL;
+ dict[k]->states[0].fallback_state = -1;
+ dict[k]->states[0].num_trans = 0;
+ dict[k]->states[0].trans = NULL;
+ dict[k]->nextlevel = NULL;
+ dict[k]->lhmin = 0;
+ dict[k]->rhmin = 0;
+ dict[k]->clhmin = 0;
+ dict[k]->crhmin = 0;
+ dict[k]->nohyphen = NULL;
+ dict[k]->nohyphenl = 0;
+
+ /* read in character set info */
+ if (k == 0) {
+ for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
+ if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
+ for (i=0;i<MAX_NAME;i++)
+ if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
+ dict[k]->cset[i] = 0;
+ } else {
+ dict[k]->cset[0] = 0;
+ }
+ dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
+ } else {
+ strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
+ dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
+ dict[k]->utf8 = dict[0]->utf8;
+ }
+
+ if (k == 0 || nextlevel) {
+ while (fgets (buf, sizeof(buf), f) != NULL) {
+ if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
+ nextlevel = 1;
+ break;
+ } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
+ }
+ } else if (k == 1) {
+ /* default first level: hyphen and ASCII apostrophe */
+ if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
+ else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
+ strncpy(buf, "1-1\n", MAX_CHARS-1); /* buf rewritten by hnj_hyphen_load here */
+ buf[MAX_CHARS-1] = '\0';
+ hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
+ hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
+ if (dict[0]->utf8) {
+ hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
+ hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
+ }
+ }
+
+ /* Could do unioning of matches here (instead of the preprocessor script).
+ If we did, the pseudocode would look something like this:
+
+ foreach state in the hash table
+ foreach i = [1..length(state) - 1]
+ state to check is substr (state, i)
+ look it up
+ if found, and if there is a match, union the match in.
+
+ It's also possible to avoid the quadratic blowup by doing the
+ search in order of increasing state string sizes - then you
+ can break the loop after finding the first match.
+
+ This step should be optional in any case - if there is a
+ preprocessed rule table, it's always faster to use that.
+
+*/
+
+ /* put in the fallback states */
+ for (i = 0; i < HASH_SIZE; i++)
+ for (e = hashtab->entries[i]; e; e = e->next)
+ {
+ if (*(e->key)) for (j = 1; 1; j++)
+ {
+ state_num = hnj_hash_lookup (hashtab, e->key + j);
+ if (state_num >= 0)
+ break;
+ }
+ /* KBH: FIXME state 0 fallback_state should always be -1? */
+ if (e->val)
+ dict[k]->states[e->val].fallback_state = state_num;
+ }
+#ifdef VERBOSE
+ for (i = 0; i < HASH_SIZE; i++)
+ for (e = hashtab->entries[i]; e; e = e->next)
+ {
+ printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
+ dict[k]->states[e->val].fallback_state);
+ for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
+ printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
+ dict[k]->states[e->val].trans[j].new_state);
+ }
+#endif
+
+#ifndef VERBOSE
+ hnj_hash_free (hashtab);
+#endif
+ state_num = 0;
+}
+ if (nextlevel) dict[0]->nextlevel = dict[1];
+ else {
+ dict[1] -> nextlevel = dict[0];
+ dict[1]->lhmin = dict[0]->lhmin;
+ dict[1]->rhmin = dict[0]->rhmin;
+ dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
+ dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
+#ifdef VERBOSE
+ HashTab *r = global[0];
+ global[0] = global[1];
+ global[1] = r;
+#endif
+ return dict[1];
+ }
+ return dict[0];
+}
+
+void hnj_hyphen_free (HyphenDict *dict)
+{
+ int state_num;
+ HyphenState *hstate;
+
+ for (state_num = 0; state_num < dict->num_states; state_num++)
+ {
+ hstate = &dict->states[state_num];
+ if (hstate->match)
+ hnj_free (hstate->match);
+ if (hstate->repl)
+ hnj_free (hstate->repl);
+ if (hstate->trans)
+ hnj_free (hstate->trans);
+ }
+ if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
+
+ if (dict->nohyphen) hnj_free(dict->nohyphen);
+
+ hnj_free (dict->states);
+
+ hnj_free (dict);
+}
+
+#define MAX_WORD 256
+
+int hnj_hyphen_hyphenate (HyphenDict *dict,
+ const char *word, int word_size,
+ char *hyphens)
+{
+ char *prep_word;
+ int i, j, k;
+ int state;
+ char ch;
+ HyphenState *hstate;
+ char *match;
+ int offset;
+
+ prep_word = (char*) hnj_malloc (word_size + 3);
+
+ j = 0;
+ prep_word[j++] = '.';
+
+ for (i = 0; i < word_size; i++) {
+ if (word[i] <= '9' && word[i] >= '0') {
+ prep_word[j++] = '.';
+ } else {
+ prep_word[j++] = word[i];
+ }
+ }
+
+ prep_word[j++] = '.';
+ prep_word[j] = '\0';
+
+ for (i = 0; i < word_size + 5; i++)
+ hyphens[i] = '0';
+
+#ifdef VERBOSE
+ printf ("prep_word = %s\n", prep_word);
+#endif
+
+ /* now, run the finite state machine */
+ state = 0;
+ for (i = 0; i < j; i++)
+ {
+ ch = prep_word[i];
+ for (;;)
+ {
+
+ if (state == -1) {
+ /* return 1; */
+ /* KBH: FIXME shouldn't this be as follows? */
+ state = 0;
+ goto try_next_letter;
+ }
+
+#ifdef VERBOSE
+ char *state_str;
+ state_str = get_state_str (state, 0);
+
+ for (k = 0; k < i - strlen (state_str); k++)
+ putchar (' ');
+ printf ("%s", state_str);
+#endif
+
+ hstate = &dict->states[state];
+ for (k = 0; k < hstate->num_trans; k++)
+ if (hstate->trans[k].ch == ch)
+ {
+ state = hstate->trans[k].new_state;
+ goto found_state;
+ }
+ state = hstate->fallback_state;
+#ifdef VERBOSE
+ printf (" falling back, fallback_state %d\n", state);
+#endif
+ }
+ found_state:
+#ifdef VERBOSE
+ printf ("found state %d\n",state);
+#endif
+ /* Additional optimization is possible here - especially,
+ elimination of trailing zeroes from the match. Leading zeroes
+ have already been optimized. */
+ match = dict->states[state].match;
+ /* replacing rules not handled by hyphen_hyphenate() */
+ if (match && !dict->states[state].repl)
+ {
+ offset = i + 1 - strlen (match);
+#ifdef VERBOSE
+ for (k = 0; k < offset; k++)
+ putchar (' ');
+ printf ("%s\n", match);
+#endif
+ /* This is a linear search because I tried a binary search and
+ found it to be just a teeny bit slower. */
+ for (k = 0; match[k]; k++)
+ if (hyphens[offset + k] < match[k])
+ hyphens[offset + k] = match[k];
+ }
+
+ /* KBH: we need this to make sure we keep looking in a word */
+ /* for patterns even if the current character is not known in state 0 */
+ /* since patterns for hyphenation may occur anywhere in the word */
+ try_next_letter: ;
+
+ }
+#ifdef VERBOSE
+ for (i = 0; i < j; i++)
+ putchar (hyphens[i]);
+ putchar ('\n');
+#endif
+
+ for (i = 0; i < j - 4; i++)
+#if 0
+ if (hyphens[i + 1] & 1)
+ hyphens[i] = '-';
+#else
+ hyphens[i] = hyphens[i + 1];
+#endif
+ hyphens[0] = '0';
+ for (; i < word_size; i++)
+ hyphens[i] = '0';
+ hyphens[word_size] = '\0';
+
+ hnj_free (prep_word);
+
+ return 0;
+}
+
+/* Unicode ligature length */
+int hnj_ligature(unsigned char c) {
+ switch (c) {
+ case 0x80: /* ff */
+ case 0x81: /* fi */
+ case 0x82: return LIG_xx; /* fl */
+ case 0x83: /* ffi */
+ case 0x84: return LIG_xxx; /* ffl */
+ case 0x85: /* long st */
+ case 0x86: return LIG_xx; /* st */
+ }
+ return 0;
+}
+
+/* character length of the first n byte of the input word */
+int hnj_hyphen_strnlen(const char * word, int n, int utf8)
+{
+ int i = 0;
+ int j = 0;
+ while (j < n && word[j] != '\0') {
+ i++;
+ /* Unicode ligature support */
+ if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
+ i += hnj_ligature(word[j + 2]);
+ }
+ for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
+ }
+ return i;
+}
+
+int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
+ char *** rep, int ** pos, int ** cut, int lhmin)
+{
+ int i = 1, j;
+
+ /* Unicode ligature support */
+ if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
+ i += hnj_ligature(word[2]);
+ }
+
+ /* ignore numbers */
+ for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
+
+ for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
+ /* check length of the non-standard part */
+ if (*rep && *pos && *cut && (*rep)[j]) {
+ char * rh = strchr((*rep)[j], '=');
+ if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
+ hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
+ free((*rep)[j]);
+ (*rep)[j] = NULL;
+ hyphens[j] = '0';
+ }
+ } else {
+ hyphens[j] = '0';
+ }
+ j++;
+
+ /* Unicode ligature support */
+ if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
+ i += hnj_ligature(word[j + 2]);
+ }
+ } while (utf8 && (word[j] & 0xc0) == 0x80);
+ return 0;
+}
+
+int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
+ char *** rep, int ** pos, int ** cut, int rhmin)
+{
+ int i = 0;
+ int j;
+
+ /* ignore numbers */
+ for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
+
+ for (j = word_size - 1; i < rhmin && j > 0; j--) {
+ /* check length of the non-standard part */
+ if (*rep && *pos && *cut && (*rep)[j]) {
+ char * rh = strchr((*rep)[j], '=');
+ if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
+ hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
+ free((*rep)[j]);
+ (*rep)[j] = NULL;
+ hyphens[j] = '0';
+ }
+ } else {
+ hyphens[j] = '0';
+ }
+ if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
+ }
+ return 0;
+}
+
+/* recursive function for compound level hyphenation */
+int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
+ char * hyphens, char *** rep, int ** pos, int ** cut,
+ int clhmin, int crhmin, int lend, int rend)
+{
+ char *prep_word;
+ int i, j, k;
+ int state;
+ char ch;
+ HyphenState *hstate;
+ char *match;
+ char *repl;
+ signed char replindex;
+ signed char replcut;
+ int offset;
+ int * matchlen;
+ int * matchindex;
+ char ** matchrepl;
+ int isrepl = 0;
+ int nHyphCount;
+
+ size_t prep_word_size = word_size + 3;
+ prep_word = (char*) hnj_malloc (prep_word_size);
+ matchlen = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
+ matchindex = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
+ matchrepl = (char**) hnj_malloc ((word_size + 3) * sizeof(char *));
+
+ j = 0;
+ prep_word[j++] = '.';
+
+ for (i = 0; i < word_size; i++) {
+ if (word[i] <= '9' && word[i] >= '0') {
+ prep_word[j++] = '.';
+ } else {
+ prep_word[j++] = word[i];
+ }
+ }
+
+
+
+ prep_word[j++] = '.';
+ prep_word[j] = '\0';
+
+ for (i = 0; i < j; i++)
+ hyphens[i] = '0';
+
+#ifdef VERBOSE
+ printf ("prep_word = %s\n", prep_word);
+#endif
+
+ /* now, run the finite state machine */
+ state = 0;
+ for (i = 0; i < j; i++)
+ {
+ ch = prep_word[i];
+ for (;;)
+ {
+
+ if (state == -1) {
+ /* return 1; */
+ /* KBH: FIXME shouldn't this be as follows? */
+ state = 0;
+ goto try_next_letter;
+ }
+
+#ifdef VERBOSE
+ char *state_str;
+ state_str = get_state_str (state, 1);
+
+ for (k = 0; k < i - strlen (state_str); k++)
+ putchar (' ');
+ printf ("%s", state_str);
+#endif
+
+ hstate = &dict->states[state];
+ for (k = 0; k < hstate->num_trans; k++)
+ if (hstate->trans[k].ch == ch)
+ {
+ state = hstate->trans[k].new_state;
+ goto found_state;
+ }
+ state = hstate->fallback_state;
+#ifdef VERBOSE
+ printf (" falling back, fallback_state %d\n", state);
+#endif
+ }
+ found_state:
+#ifdef VERBOSE
+ printf ("found state %d\n",state);
+#endif
+ /* Additional optimization is possible here - especially,
+ elimination of trailing zeroes from the match. Leading zeroes
+ have already been optimized. */
+ match = dict->states[state].match;
+ repl = dict->states[state].repl;
+ replindex = dict->states[state].replindex;
+ replcut = dict->states[state].replcut;
+ /* replacing rules not handled by hyphen_hyphenate() */
+ if (match)
+ {
+ offset = i + 1 - strlen (match);
+#ifdef VERBOSE
+ for (k = 0; k < offset; k++)
+ putchar (' ');
+ printf ("%s (%s)\n", match, repl);
+#endif
+ if (repl) {
+ if (!isrepl) for(; isrepl < word_size; isrepl++) {
+ matchrepl[isrepl] = NULL;
+ matchindex[isrepl] = -1;
+ }
+ matchlen[offset + replindex] = replcut;
+ }
+ /* This is a linear search because I tried a binary search and
+ found it to be just a teeny bit slower. */
+ for (k = 0; match[k]; k++) {
+ if ((hyphens[offset + k] < match[k])) {
+ hyphens[offset + k] = match[k];
+ if (match[k]&1) {
+ matchrepl[offset + k] = repl;
+ if (repl && (k >= replindex) && (k <= replindex + replcut)) {
+ matchindex[offset + replindex] = offset + k;
+ }
+ }
+ }
+ }
+
+ }
+
+ /* KBH: we need this to make sure we keep looking in a word */
+ /* for patterns even if the current character is not known in state 0 */
+ /* since patterns for hyphenation may occur anywhere in the word */
+ try_next_letter: ;
+
+ }
+#ifdef VERBOSE
+ for (i = 0; i < j; i++)
+ putchar (hyphens[i]);
+ putchar ('\n');
+#endif
+
+ for (i = 0; i < j - 3; i++)
+#if 0
+ if (hyphens[i + 1] & 1)
+ hyphens[i] = '-';
+#else
+ hyphens[i] = hyphens[i + 1];
+#endif
+ for (; i < word_size; i++)
+ hyphens[i] = '0';
+ hyphens[word_size] = '\0';
+
+ /* now create a new char string showing hyphenation positions */
+ /* count the hyphens and allocate space for the new hyphenated string */
+ nHyphCount = 0;
+ for (i = 0; i < word_size; i++)
+ if (hyphens[i]&1)
+ nHyphCount++;
+ j = 0;
+ for (i = 0; i < word_size; i++) {
+ if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
+ if (rep && pos && cut) {
+ if (!*rep)
+ *rep = (char **) calloc(word_size, sizeof(char *));
+ if (!*pos)
+ *pos = (int *) calloc(word_size, sizeof(int));
+ if (!*cut) {
+ *cut = (int *) calloc(word_size, sizeof(int));
+ }
+ (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
+ (*pos)[matchindex[i] - 1] = matchindex[i] - i;
+ (*cut)[matchindex[i] - 1] = matchlen[i];
+ }
+ j += strlen(matchrepl[matchindex[i]]);
+ i += matchlen[i] - 1;
+ }
+ }
+
+ hnj_free (matchrepl);
+ hnj_free (matchlen);
+ hnj_free (matchindex);
+
+ /* recursive hyphenation of the first (compound) level segments */
+ if (dict->nextlevel) {
+ char ** rep2;
+ int * pos2;
+ int * cut2;
+ char * hyphens2;
+ int begin = 0;
+
+ rep2 = (char**) hnj_malloc (word_size * sizeof(char *));
+ pos2 = (int*) hnj_malloc (word_size * sizeof(int));
+ cut2 = (int*) hnj_malloc (word_size * sizeof(int));
+ hyphens2 = (char*) hnj_malloc (word_size + 3);
+ for (i = 0; i < word_size; i++) rep2[i] = NULL;
+ for (i = 0; i < word_size; i++) if
+ (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
+ if (i - begin > 0) {
+ int hyph = 0;
+ prep_word[i + 2] = '\0';
+ /* non-standard hyphenation at compound boundary (Schiffahrt) */
+ if (rep && *rep && *pos && *cut && (*rep)[i]) {
+ char * l = strchr((*rep)[i], '=');
+ size_t offset = 2 + i - (*pos)[i];
+ strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
+ prep_word[prep_word_size - 1] = '\0';
+ if (l) {
+ hyph = (l - (*rep)[i]) - (*pos)[i];
+ prep_word[2 + i + hyph] = '\0';
+ }
+ }
+ hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
+ hyphens2, &rep2, &pos2, &cut2, clhmin,
+ crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
+ for (j = 0; j < i - begin; j++) {
+ hyphens[begin + j] = hyphens2[j];
+ if (rep2[j] && rep && pos && cut) {
+ if (!*rep && !*pos && !*cut) {
+ int k;
+ *rep = (char **) malloc(sizeof(char *) * word_size);
+ *pos = (int *) malloc(sizeof(int) * word_size);
+ *cut = (int *) malloc(sizeof(int) * word_size);
+ for (k = 0; k < word_size; k++) {
+ (*rep)[k] = NULL;
+ (*pos)[k] = 0;
+ (*cut)[k] = 0;
+ }
+ }
+ (*rep)[begin + j] = rep2[j];
+ (*pos)[begin + j] = pos2[j];
+ (*cut)[begin + j] = cut2[j];
+ }
+ }
+ prep_word[i + 2] = word[i + 1];
+ if (*rep && *pos && *cut && (*rep)[i]) {
+ size_t offset = 1;
+ strncpy(prep_word + offset, word, prep_word_size - offset - 1);
+ prep_word[prep_word_size - 1] = '\0';
+ }
+ }
+ begin = i + 1;
+ for (j = 0; j < word_size; j++) rep2[j] = NULL;
+ }
+
+ /* non-compound */
+ if (begin == 0) {
+ hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
+ hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
+ if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
+ rep, pos, cut, clhmin);
+ if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
+ rep, pos, cut, crhmin);
+ }
+
+ free(rep2);
+ free(cut2);
+ free(pos2);
+ free(hyphens2);
+ }
+
+ hnj_free (prep_word);
+ return 0;
+}
+
+/* UTF-8 normalization of hyphen and non-standard positions */
+int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
+ char *** rep, int ** pos, int ** cut)
+{
+ int i, j, k;
+ if ((((unsigned char) word[0]) >> 6) == 2) {
+ fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
+ return 1;
+ }
+
+ /* calculate UTF-8 character positions */
+ for (i = 0, j = -1; i < word_size; i++) {
+ /* beginning of an UTF-8 character (not '10' start bits) */
+ if ((((unsigned char) word[i]) >> 6) != 2) j++;
+ hyphens[j] = hyphens[i];
+ if (rep && pos && cut && *rep && *pos && *cut) {
+ int l = (*pos)[i];
+ (*pos)[j] = 0;
+ for (k = 0; k < l; k++) {
+ if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
+ }
+ k = i - l + 1;
+ l = k + (*cut)[i];
+ (*cut)[j] = 0;
+ for (; k < l; k++) {
+ if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
+ }
+ (*rep)[j] = (*rep)[i];
+ if (j < i) {
+ (*rep)[i] = NULL;
+ (*pos)[i] = 0;
+ (*cut)[i] = 0;
+ }
+ }
+ }
+ hyphens[j + 1] = '\0';
+#ifdef VERBOSE
+ printf ("nums: %s\n", hyphens);
+#endif
+ return 0;
+}
+
+/* get the word with all possible hyphenations (output: hyphword) */
+void hnj_hyphen_hyphword(const char * word, int word_size, const char * hyphens,
+ char * hyphword, char *** rep, int ** pos, int ** cut)
+{
+
+ if (word_size <= 0 || word_size > INT_MAX / 2) {
+ hyphword[0] = '\0';
+ return;
+ }
+
+ /* hyphword buffer size must be at least 2 * l */
+ int hyphword_size = 2 * word_size - 1;
+
+ int nonstandard = 0;
+ if (*rep && *pos && *cut) {
+ nonstandard = 1;
+ }
+
+ int i;
+ int j = 0;
+ for (i = 0; i < word_size && j < hyphword_size; i++) {
+ hyphword[j++] = word[i];
+ if (hyphens[i]&1 && j < hyphword_size) {
+ if (nonstandard && (*rep)[i] && j >= (*pos)[i]) {
+ /* non-standard */
+ j -= (*pos)[i];
+ char *s = (*rep)[i];
+ while (*s && j < hyphword_size) {
+ hyphword[j++] = *s++;
+ }
+ i += (*cut)[i] - (*pos)[i];
+ } else {
+ /* standard */
+ hyphword[j++] = '=';
+ }
+ }
+ }
+ hyphword[j] = '\0';
+}
+
+
+/* main api function with default hyphenmin parameters */
+int hnj_hyphen_hyphenate2 (HyphenDict *dict,
+ const char *word, int word_size, char * hyphens,
+ char *hyphword, char *** rep, int ** pos, int ** cut)
+{
+ hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
+ dict->clhmin, dict->crhmin, 1, 1);
+ hnj_hyphen_lhmin(dict->utf8, word, word_size,
+ hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
+ hnj_hyphen_rhmin(dict->utf8, word, word_size,
+ hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
+
+ /* nohyphen */
+ if (dict->nohyphen) {
+ char * nh = dict->nohyphen;
+ int nhi;
+ for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
+ char * nhy = (char *) strstr(word, nh);
+ while (nhy) {
+ hyphens[nhy - word + strlen(nh) - 1] = '0';
+ if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0';
+ nhy = (char *) strstr(nhy + 1, nh);
+ }
+ nh = nh + strlen(nh) + 1;
+ }
+ }
+
+ if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
+ if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
+#ifdef VERBOSE
+ printf ("nums: %s\n", hyphens);
+#endif
+ return 0;
+}
+
+/* previous main api function with hyphenmin parameters */
+int hnj_hyphen_hyphenate3 (HyphenDict *dict,
+ const char *word, int word_size, char * hyphens,
+ char *hyphword, char *** rep, int ** pos, int ** cut,
+ int lhmin, int rhmin, int clhmin, int crhmin)
+{
+ lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
+ rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
+ clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
+ crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
+ hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
+ clhmin, crhmin, 1, 1);
+ hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
+ rep, pos, cut, (lhmin > 0 ? lhmin : 2));
+ hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
+ rep, pos, cut, (rhmin > 0 ? rhmin : 2));
+ if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
+
+ /* nohyphen */
+ if (dict->nohyphen) {
+ char * nh = dict->nohyphen;
+ int nhi;
+ for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
+ char * nhy = (char *) strstr(word, nh);
+ while (nhy) {
+ hyphens[nhy - word + strlen(nh) - 1] = 0;
+ if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0;
+ nhy = (char *) strstr(nhy + 1, nh);
+ }
+ nh = nh + strlen(nh) + 1;
+ }
+ }
+
+ if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
+ return 0;
+}