diff options
Diffstat (limited to 'intl/hyphenation')
-rw-r--r-- | intl/hyphenation/README.mozilla | 13 | ||||
-rw-r--r-- | intl/hyphenation/glue/hnjalloc.h | 51 | ||||
-rw-r--r-- | intl/hyphenation/glue/hnjstdio.cpp | 119 | ||||
-rw-r--r-- | intl/hyphenation/glue/moz.build | 29 | ||||
-rw-r--r-- | intl/hyphenation/glue/nsHyphenationManager.cpp | 321 | ||||
-rw-r--r-- | intl/hyphenation/glue/nsHyphenationManager.h | 55 | ||||
-rw-r--r-- | intl/hyphenation/glue/nsHyphenator.cpp | 159 | ||||
-rw-r--r-- | intl/hyphenation/glue/nsHyphenator.h | 33 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/AUTHORS | 17 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/COPYING | 17 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/COPYING.LGPL | 515 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/COPYING.MPL | 470 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/README | 134 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/README.compound | 87 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/README.hyphen | 108 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/README.nonstandard | 122 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/hyphen.c | 1187 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/hyphen.h | 175 | ||||
-rw-r--r-- | intl/hyphenation/hyphen/moz.build | 19 |
19 files changed, 3631 insertions, 0 deletions
diff --git a/intl/hyphenation/README.mozilla b/intl/hyphenation/README.mozilla new file mode 100644 index 000000000..dc0718f70 --- /dev/null +++ b/intl/hyphenation/README.mozilla @@ -0,0 +1,13 @@ +About the hyphenation code in this directory +============================================ + +The hyphen directory comes from the Hyphen library, part of the hunspell project. + https://github.com/hunspell/hyphen + +This code is distributed under the GPL 2.0/LGPL 2.1/MPL 1.1 tri-license, as +detailed in the associated README and COPYING files. + +Note that we do not include other tools and resources found in the complete +Hyphen package from upstream, so the original README.* files may refer to +additional files that are not present in the Mozilla source tree. + diff --git a/intl/hyphenation/glue/hnjalloc.h b/intl/hyphenation/glue/hnjalloc.h new file mode 100644 index 000000000..fec3a4bc9 --- /dev/null +++ b/intl/hyphenation/glue/hnjalloc.h @@ -0,0 +1,51 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * Simple replacement for hnjalloc.h from libhyphen-2.x, to use moz_x* memory + * allocation functions. Note that the hyphen.c code does *NOT* check for + * NULL from memory (re)allocation, so it is essential that we use the + * "infallible" moz_x* variants here. + */ + +#include "mozilla/mozalloc.h" + +#define hnj_malloc(size) moz_xmalloc(size) +#define hnj_realloc(p, size) moz_xrealloc(p, size) +#define hnj_free(p) free(p) + +/* + * To enable us to load hyphenation dictionaries from arbitrary resource URIs, + * not just through file paths using stdio, we override the (few) stdio APIs + * that hyphen.c uses and provide our own reimplementation that calls Gecko + * i/o methods. 
+ */ + +#include <stdio.h> /* ensure stdio.h is loaded before our macros */ + +#undef FILE +#define FILE hnjFile + +#define fopen(path,mode) hnjFopen(path,mode) +#define fclose(file) hnjFclose(file) +#define fgets(buf,count,file) hnjFgets(buf,count,file) + +typedef struct hnjFile_ hnjFile; + +#ifdef __cplusplus +extern "C" { +#endif + +hnjFile* hnjFopen(const char* aURISpec, const char* aMode); + +int hnjFclose(hnjFile* f); + +char* hnjFgets(char* s, int n, hnjFile* f); + +#ifdef __cplusplus +} +#endif + + diff --git a/intl/hyphenation/glue/hnjstdio.cpp b/intl/hyphenation/glue/hnjstdio.cpp new file mode 100644 index 000000000..660ebaf13 --- /dev/null +++ b/intl/hyphenation/glue/hnjstdio.cpp @@ -0,0 +1,119 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// This file provides substitutes for the basic stdio routines used by hyphen.c +// to read its dictionary files. We #define the stdio names to these versions +// in hnjalloc.h, so that we can use nsIURI and nsIInputStream to specify and +// access the dictionary resources. 
+ +#include "hnjalloc.h" +#undef FILE // Undo the damage done in hnjalloc.h +#include "nsNetUtil.h" +#include "nsIInputStream.h" +#include "nsIURI.h" +#include "nsContentUtils.h" + +#define BUFSIZE 1024 + +struct hnjFile_ { + nsCOMPtr<nsIInputStream> mStream; + char mBuffer[BUFSIZE]; + uint32_t mCurPos; + uint32_t mLimit; +}; + +// replacement for fopen() +// (not a full substitute: only supports read access) +hnjFile* +hnjFopen(const char* aURISpec, const char* aMode) +{ + // this override only needs to support "r" + NS_ASSERTION(!strcmp(aMode, "r"), "unsupported fopen() mode in hnjFopen"); + + nsCOMPtr<nsIURI> uri; + nsresult rv = NS_NewURI(getter_AddRefs(uri), aURISpec); + if (NS_FAILED(rv)) { + return nullptr; + } + + nsCOMPtr<nsIChannel> channel; + rv = NS_NewChannel(getter_AddRefs(channel), + uri, + nsContentUtils::GetSystemPrincipal(), + nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_DATA_IS_NULL, + nsIContentPolicy::TYPE_OTHER); + if (NS_FAILED(rv)) { + return nullptr; + } + + nsCOMPtr<nsIInputStream> instream; + rv = channel->Open2(getter_AddRefs(instream)); + if (NS_FAILED(rv)) { + return nullptr; + } + + hnjFile *f = new hnjFile; + f->mStream = instream; + f->mCurPos = 0; + f->mLimit = 0; + + return f; +} + +// replacement for fclose() +int +hnjFclose(hnjFile* f) +{ + NS_ASSERTION(f && f->mStream, "bad argument to hnjFclose"); + + int result = 0; + nsresult rv = f->mStream->Close(); + if (NS_FAILED(rv)) { + result = EOF; + } + f->mStream = nullptr; + + delete f; + return result; +} + +// replacement for fgets() +// (not a full reimplementation, but sufficient for libhyphen's needs) +char* +hnjFgets(char* s, int n, hnjFile* f) +{ + NS_ASSERTION(s && f, "bad argument to hnjFgets"); + + int i = 0; + while (i < n - 1) { + if (f->mCurPos < f->mLimit) { + char c = f->mBuffer[f->mCurPos++]; + s[i++] = c; + if (c == '\n' || c == '\r') { + break; + } + continue; + } + + f->mCurPos = 0; + + nsresult rv = f->mStream->Read(f->mBuffer, BUFSIZE, &f->mLimit); + if (NS_FAILED(rv)) 
{ + f->mLimit = 0; + return nullptr; + } + + if (f->mLimit == 0) { + break; + } + } + + if (i == 0) { + return nullptr; // end of file + } + + s[i] = '\0'; // null-terminate the returned string + return s; +} diff --git a/intl/hyphenation/glue/moz.build b/intl/hyphenation/glue/moz.build new file mode 100644 index 000000000..a2d71f4b5 --- /dev/null +++ b/intl/hyphenation/glue/moz.build @@ -0,0 +1,29 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +EXPORTS += [ + 'nsHyphenationManager.h', + 'nsHyphenator.h', +] + +UNIFIED_SOURCES += [ + 'nsHyphenationManager.cpp', + 'nsHyphenator.cpp', +] + +# These files cannot be built in unified mode because they include hnjalloc.h. +SOURCES += [ + 'hnjstdio.cpp', +] + +LOCAL_INCLUDES += [ + '../hyphen', +] + +FINAL_LIBRARY = 'xul' + +if CONFIG['GNU_CXX']: + CXXFLAGS += ['-Wno-error=shadow'] diff --git a/intl/hyphenation/glue/nsHyphenationManager.cpp b/intl/hyphenation/glue/nsHyphenationManager.cpp new file mode 100644 index 000000000..998550e5d --- /dev/null +++ b/intl/hyphenation/glue/nsHyphenationManager.cpp @@ -0,0 +1,321 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsHyphenationManager.h" +#include "nsHyphenator.h" +#include "nsIAtom.h" +#include "nsIFile.h" +#include "nsIURI.h" +#include "nsIProperties.h" +#include "nsISimpleEnumerator.h" +#include "nsIDirectoryEnumerator.h" +#include "nsDirectoryServiceDefs.h" +#include "nsNetUtil.h" +#include "nsUnicharUtils.h" +#include "mozilla/Preferences.h" +#include "nsZipArchive.h" +#include "mozilla/Services.h" +#include "nsIObserverService.h" +#include "nsCRT.h" +#include "nsAppDirectoryServiceDefs.h" +#include "nsDirectoryServiceUtils.h" + +using namespace mozilla; + +static const char kIntlHyphenationAliasPrefix[] = "intl.hyphenation-alias."; +static const char kMemoryPressureNotification[] = "memory-pressure"; + +nsHyphenationManager *nsHyphenationManager::sInstance = nullptr; + +NS_IMPL_ISUPPORTS(nsHyphenationManager::MemoryPressureObserver, + nsIObserver) + +NS_IMETHODIMP +nsHyphenationManager::MemoryPressureObserver::Observe(nsISupports *aSubject, + const char *aTopic, + const char16_t *aData) +{ + if (!nsCRT::strcmp(aTopic, kMemoryPressureNotification)) { + // We don't call Instance() here, as we don't want to create a hyphenation + // manager if there isn't already one in existence. + // (This observer class is local to the hyphenation manager, so it can use + // the protected members directly.) 
+ if (nsHyphenationManager::sInstance) { + nsHyphenationManager::sInstance->mHyphenators.Clear(); + } + } + return NS_OK; +} + +nsHyphenationManager* +nsHyphenationManager::Instance() +{ + if (sInstance == nullptr) { + sInstance = new nsHyphenationManager(); + + nsCOMPtr<nsIObserverService> obs = mozilla::services::GetObserverService(); + if (obs) { + obs->AddObserver(new MemoryPressureObserver, + kMemoryPressureNotification, false); + } + } + return sInstance; +} + +void +nsHyphenationManager::Shutdown() +{ + delete sInstance; + sInstance = nullptr; +} + +nsHyphenationManager::nsHyphenationManager() +{ + LoadPatternList(); + LoadAliases(); +} + +nsHyphenationManager::~nsHyphenationManager() +{ + sInstance = nullptr; +} + +already_AddRefed<nsHyphenator> +nsHyphenationManager::GetHyphenator(nsIAtom *aLocale) +{ + RefPtr<nsHyphenator> hyph; + mHyphenators.Get(aLocale, getter_AddRefs(hyph)); + if (hyph) { + return hyph.forget(); + } + nsCOMPtr<nsIURI> uri = mPatternFiles.Get(aLocale); + if (!uri) { + nsCOMPtr<nsIAtom> alias = mHyphAliases.Get(aLocale); + if (alias) { + mHyphenators.Get(alias, getter_AddRefs(hyph)); + if (hyph) { + return hyph.forget(); + } + uri = mPatternFiles.Get(alias); + if (uri) { + aLocale = alias; + } + } + if (!uri) { + // In the case of a locale such as "de-DE-1996", we try replacing + // successive trailing subtags with "-*" to find fallback patterns, + // so "de-DE-1996" -> "de-DE-*" (and then recursively -> "de-*") + nsAtomCString localeStr(aLocale); + if (StringEndsWith(localeStr, NS_LITERAL_CSTRING("-*"))) { + localeStr.Truncate(localeStr.Length() - 2); + } + int32_t i = localeStr.RFindChar('-'); + if (i > 1) { + localeStr.Replace(i, localeStr.Length() - i, "-*"); + nsCOMPtr<nsIAtom> fuzzyLocale = NS_Atomize(localeStr); + return GetHyphenator(fuzzyLocale); + } else { + return nullptr; + } + } + } + hyph = new nsHyphenator(uri); + if (hyph->IsValid()) { + mHyphenators.Put(aLocale, hyph); + return hyph.forget(); + } +#ifdef DEBUG + 
nsCString msg("failed to load patterns from "); + msg += uri->GetSpecOrDefault(); + NS_WARNING(msg.get()); +#endif + mPatternFiles.Remove(aLocale); + return nullptr; +} + +void +nsHyphenationManager::LoadPatternList() +{ + mPatternFiles.Clear(); + mHyphenators.Clear(); + + LoadPatternListFromOmnijar(Omnijar::GRE); + LoadPatternListFromOmnijar(Omnijar::APP); + + nsCOMPtr<nsIProperties> dirSvc = + do_GetService(NS_DIRECTORY_SERVICE_CONTRACTID); + if (!dirSvc) { + return; + } + + nsresult rv; + nsCOMPtr<nsIFile> greDir; + rv = dirSvc->Get(NS_GRE_DIR, + NS_GET_IID(nsIFile), getter_AddRefs(greDir)); + if (NS_SUCCEEDED(rv)) { + greDir->AppendNative(NS_LITERAL_CSTRING("hyphenation")); + LoadPatternListFromDir(greDir); + } + + nsCOMPtr<nsIFile> appDir; + rv = dirSvc->Get(NS_XPCOM_CURRENT_PROCESS_DIR, + NS_GET_IID(nsIFile), getter_AddRefs(appDir)); + if (NS_SUCCEEDED(rv)) { + appDir->AppendNative(NS_LITERAL_CSTRING("hyphenation")); + bool equals; + if (NS_SUCCEEDED(appDir->Equals(greDir, &equals)) && !equals) { + LoadPatternListFromDir(appDir); + } + } + + nsCOMPtr<nsIFile> profileDir; + rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_LOCAL_50_DIR, + getter_AddRefs(profileDir)); + if (NS_SUCCEEDED(rv)) { + profileDir->AppendNative(NS_LITERAL_CSTRING("hyphenation")); + LoadPatternListFromDir(profileDir); + } +} + +void +nsHyphenationManager::LoadPatternListFromOmnijar(Omnijar::Type aType) +{ + nsCString base; + nsresult rv = Omnijar::GetURIString(aType, base); + if (NS_FAILED(rv)) { + return; + } + + RefPtr<nsZipArchive> zip = Omnijar::GetReader(aType); + if (!zip) { + return; + } + + nsZipFind *find; + zip->FindInit("hyphenation/hyph_*.dic", &find); + if (!find) { + return; + } + + const char *result; + uint16_t len; + while (NS_SUCCEEDED(find->FindNext(&result, &len))) { + nsCString uriString(base); + uriString.Append(result, len); + nsCOMPtr<nsIURI> uri; + rv = NS_NewURI(getter_AddRefs(uri), uriString); + if (NS_FAILED(rv)) { + continue; + } + nsCString locale; + rv = 
uri->GetPath(locale); + if (NS_FAILED(rv)) { + continue; + } + ToLowerCase(locale); + locale.SetLength(locale.Length() - 4); // strip ".dic" + locale.Cut(0, locale.RFindChar('/') + 1); // strip directory + if (StringBeginsWith(locale, NS_LITERAL_CSTRING("hyph_"))) { + locale.Cut(0, 5); + } + for (uint32_t i = 0; i < locale.Length(); ++i) { + if (locale[i] == '_') { + locale.Replace(i, 1, '-'); + } + } + nsCOMPtr<nsIAtom> localeAtom = NS_Atomize(locale); + if (NS_SUCCEEDED(rv)) { + mPatternFiles.Put(localeAtom, uri); + } + } + + delete find; +} + +void +nsHyphenationManager::LoadPatternListFromDir(nsIFile *aDir) +{ + nsresult rv; + + bool check = false; + rv = aDir->Exists(&check); + if (NS_FAILED(rv) || !check) { + return; + } + + rv = aDir->IsDirectory(&check); + if (NS_FAILED(rv) || !check) { + return; + } + + nsCOMPtr<nsISimpleEnumerator> e; + rv = aDir->GetDirectoryEntries(getter_AddRefs(e)); + if (NS_FAILED(rv)) { + return; + } + + nsCOMPtr<nsIDirectoryEnumerator> files(do_QueryInterface(e)); + if (!files) { + return; + } + + nsCOMPtr<nsIFile> file; + while (NS_SUCCEEDED(files->GetNextFile(getter_AddRefs(file))) && file){ + nsAutoString dictName; + file->GetLeafName(dictName); + NS_ConvertUTF16toUTF8 locale(dictName); + ToLowerCase(locale); + if (!StringEndsWith(locale, NS_LITERAL_CSTRING(".dic"))) { + continue; + } + if (StringBeginsWith(locale, NS_LITERAL_CSTRING("hyph_"))) { + locale.Cut(0, 5); + } + locale.SetLength(locale.Length() - 4); // strip ".dic" + for (uint32_t i = 0; i < locale.Length(); ++i) { + if (locale[i] == '_') { + locale.Replace(i, 1, '-'); + } + } +#ifdef DEBUG_hyph + printf("adding hyphenation patterns for %s: %s\n", locale.get(), + NS_ConvertUTF16toUTF8(dictName).get()); +#endif + nsCOMPtr<nsIAtom> localeAtom = NS_Atomize(locale); + nsCOMPtr<nsIURI> uri; + nsresult rv = NS_NewFileURI(getter_AddRefs(uri), file); + if (NS_SUCCEEDED(rv)) { + mPatternFiles.Put(localeAtom, uri); + } + } +} + +void +nsHyphenationManager::LoadAliases() +{ + 
nsIPrefBranch* prefRootBranch = Preferences::GetRootBranch(); + if (!prefRootBranch) { + return; + } + uint32_t prefCount; + char **prefNames; + nsresult rv = prefRootBranch->GetChildList(kIntlHyphenationAliasPrefix, + &prefCount, &prefNames); + if (NS_SUCCEEDED(rv) && prefCount > 0) { + for (uint32_t i = 0; i < prefCount; ++i) { + nsAdoptingCString value = Preferences::GetCString(prefNames[i]); + if (value) { + nsAutoCString alias(prefNames[i]); + alias.Cut(0, sizeof(kIntlHyphenationAliasPrefix) - 1); + ToLowerCase(alias); + ToLowerCase(value); + nsCOMPtr<nsIAtom> aliasAtom = NS_Atomize(alias); + nsCOMPtr<nsIAtom> valueAtom = NS_Atomize(value); + mHyphAliases.Put(aliasAtom, valueAtom); + } + } + NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(prefCount, prefNames); + } +} diff --git a/intl/hyphenation/glue/nsHyphenationManager.h b/intl/hyphenation/glue/nsHyphenationManager.h new file mode 100644 index 000000000..fa7d73f18 --- /dev/null +++ b/intl/hyphenation/glue/nsHyphenationManager.h @@ -0,0 +1,55 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef nsHyphenationManager_h__ +#define nsHyphenationManager_h__ + +#include "nsInterfaceHashtable.h" +#include "nsRefPtrHashtable.h" +#include "nsHashKeys.h" +#include "nsIObserver.h" +#include "mozilla/Omnijar.h" + +class nsHyphenator; +class nsIAtom; +class nsIURI; + +class nsHyphenationManager +{ +public: + nsHyphenationManager(); + + already_AddRefed<nsHyphenator> GetHyphenator(nsIAtom *aLocale); + + static nsHyphenationManager *Instance(); + + static void Shutdown(); + +private: + ~nsHyphenationManager(); + +protected: + class MemoryPressureObserver final : public nsIObserver + { + ~MemoryPressureObserver() {} + + public: + NS_DECL_ISUPPORTS + NS_DECL_NSIOBSERVER + }; + + void LoadPatternList(); + void LoadPatternListFromOmnijar(mozilla::Omnijar::Type aType); + void LoadPatternListFromDir(nsIFile *aDir); + void LoadAliases(); + + nsInterfaceHashtable<nsISupportsHashKey,nsIAtom> mHyphAliases; + nsInterfaceHashtable<nsISupportsHashKey,nsIURI> mPatternFiles; + nsRefPtrHashtable<nsISupportsHashKey,nsHyphenator> mHyphenators; + + static nsHyphenationManager *sInstance; +}; + +#endif // nsHyphenationManager_h__ diff --git a/intl/hyphenation/glue/nsHyphenator.cpp b/intl/hyphenation/glue/nsHyphenator.cpp new file mode 100644 index 000000000..bcb87baf6 --- /dev/null +++ b/intl/hyphenation/glue/nsHyphenator.cpp @@ -0,0 +1,159 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsHyphenator.h" +#include "nsIFile.h" +#include "nsUTF8Utils.h" +#include "nsUnicodeProperties.h" +#include "nsUnicharUtilCIID.h" +#include "nsIURI.h" + +#include "hyphen.h" + +nsHyphenator::nsHyphenator(nsIURI *aURI) + : mDict(nullptr) +{ + nsCString uriSpec; + nsresult rv = aURI->GetSpec(uriSpec); + if (NS_FAILED(rv)) { + return; + } + mDict = hnj_hyphen_load(uriSpec.get()); +#ifdef DEBUG + if (mDict) { + printf("loaded hyphenation patterns from %s\n", uriSpec.get()); + } +#endif +} + +nsHyphenator::~nsHyphenator() +{ + if (mDict != nullptr) { + hnj_hyphen_free((HyphenDict*)mDict); + mDict = nullptr; + } +} + +bool +nsHyphenator::IsValid() +{ + return (mDict != nullptr); +} + +nsresult +nsHyphenator::Hyphenate(const nsAString& aString, nsTArray<bool>& aHyphens) +{ + if (!aHyphens.SetLength(aString.Length(), mozilla::fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool)); + + bool inWord = false; + uint32_t wordStart = 0, wordLimit = 0; + uint32_t chLen; + for (uint32_t i = 0; i < aString.Length(); i += chLen) { + uint32_t ch = aString[i]; + chLen = 1; + + if (NS_IS_HIGH_SURROGATE(ch)) { + if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) { + ch = SURROGATE_TO_UCS4(ch, aString[i+1]); + chLen = 2; + } else { + NS_WARNING("unpaired surrogate found during hyphenation"); + } + } + + nsIUGenCategory::nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch); + if (cat == nsIUGenCategory::kLetter || cat == nsIUGenCategory::kMark) { + if (!inWord) { + inWord = true; + wordStart = i; + } + wordLimit = i + chLen; + if (i + chLen < aString.Length()) { + continue; + } + } + + if (inWord) { + // Convert the word to utf-8 for libhyphen, lowercasing it as we go + // so that it will match the (lowercased) patterns (bug 1105644). 
+ nsAutoCString utf8; + const char16_t* const begin = aString.BeginReading(); + const char16_t *cur = begin + wordStart; + const char16_t *end = begin + wordLimit; + while (cur < end) { + uint32_t ch = *cur++; + + if (NS_IS_HIGH_SURROGATE(ch)) { + if (cur < end && NS_IS_LOW_SURROGATE(*cur)) { + ch = SURROGATE_TO_UCS4(ch, *cur++); + } else { + ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR + } + } else if (NS_IS_LOW_SURROGATE(ch)) { + ch = 0xfffd; // unpaired surrogate + } + + // XXX What about language-specific casing? Consider Turkish I/i... + // In practice, it looks like the current patterns will not be + // affected by this, as they treat dotted and undotted i similarly. + ch = ToLowerCase(ch); + + if (ch < 0x80) { // U+0000 - U+007F + utf8.Append(ch); + } else if (ch < 0x0800) { // U+0100 - U+07FF + utf8.Append(0xC0 | (ch >> 6)); + utf8.Append(0x80 | (0x003F & ch)); + } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF + utf8.Append(0xE0 | (ch >> 12)); + utf8.Append(0x80 | (0x003F & (ch >> 6))); + utf8.Append(0x80 | (0x003F & ch)); + } else { + utf8.Append(0xF0 | (ch >> 18)); + utf8.Append(0x80 | (0x003F & (ch >> 12))); + utf8.Append(0x80 | (0x003F & (ch >> 6))); + utf8.Append(0x80 | (0x003F & ch)); + } + } + + AutoTArray<char,200> utf8hyphens; + utf8hyphens.SetLength(utf8.Length() + 5); + char **rep = nullptr; + int *pos = nullptr; + int *cut = nullptr; + int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, + utf8.BeginReading(), utf8.Length(), + utf8hyphens.Elements(), nullptr, + &rep, &pos, &cut); + if (!err) { + // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer + // from utf8 code unit indexing (which would match the utf8 input + // string directly) to Unicode character indexing. + // We then need to convert this to utf16 code unit offsets for Gecko. 
+ const char *hyphPtr = utf8hyphens.Elements(); + const char16_t *cur = begin + wordStart; + const char16_t *end = begin + wordLimit; + while (cur < end) { + if (*hyphPtr & 0x01) { + aHyphens[cur - begin] = true; + } + cur++; + if (cur < end && NS_IS_LOW_SURROGATE(*cur) && + NS_IS_HIGH_SURROGATE(*(cur-1))) + { + cur++; + } + hyphPtr++; + } + } + } + + inWord = false; + } + + return NS_OK; +} diff --git a/intl/hyphenation/glue/nsHyphenator.h b/intl/hyphenation/glue/nsHyphenator.h new file mode 100644 index 000000000..96975d253 --- /dev/null +++ b/intl/hyphenation/glue/nsHyphenator.h @@ -0,0 +1,33 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsHyphenator_h__ +#define nsHyphenator_h__ + +#include "nsCOMPtr.h" +#include "nsString.h" +#include "nsTArray.h" + +class nsIURI; + +class nsHyphenator +{ +public: + explicit nsHyphenator(nsIURI *aURI); + + NS_INLINE_DECL_REFCOUNTING(nsHyphenator) + + bool IsValid(); + + nsresult Hyphenate(const nsAString& aText, nsTArray<bool>& aHyphens); + +private: + ~nsHyphenator(); + +protected: + void *mDict; +}; + +#endif // nsHyphenator_h__ diff --git a/intl/hyphenation/hyphen/AUTHORS b/intl/hyphenation/hyphen/AUTHORS new file mode 100644 index 000000000..e1e0f3c84 --- /dev/null +++ b/intl/hyphenation/hyphen/AUTHORS @@ -0,0 +1,17 @@ +Libhnj was written by Raph Levien <raph at acm dot org>. 
+ +Original Libhnj source with OOo's patches are managed by Rene Engelhard and +Chris Halls at Debian: http://packages.debian.org/stable/libdevel/libhnj-dev +and http://packages.debian.org/unstable/source/libhnj + +This subset of Libhnj was extended by +Peter Novodvorsky <nidd at alt-linux dot org> (OOo integration), +László Németh <nemeth at numbertext dot org> (non-standard and compound +hyphenation with Unicode support), +Nanning Buitenhuis <nanning at elvenkind dot com> (substrings.c) + +Write bug reports to László Németh or in the bug tracker of hunspell.sf.net. + +--- +Please contact Raph Levien for information about licensing for +proprietary applications. diff --git a/intl/hyphenation/hyphen/COPYING b/intl/hyphenation/hyphen/COPYING new file mode 100644 index 000000000..4c278cb77 --- /dev/null +++ b/intl/hyphenation/hyphen/COPYING @@ -0,0 +1,17 @@ +GPL 2.0/LGPL 2.1/MPL 1.1 tri-license + +The contents of this software may be used under the terms of +the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL", +see COPYING.LGPL) or the Mozilla Public License Version 1.1 or later +(the "MPL", see COPYING.MPL). + +The Plain TeX hyphenation tables "hyphen.tex" by Donald E. Knuth +has a non MPL/LGPL compatible license, but freely redistributable: +"Unlimited copying and redistribution of this file are permitted as long +as this file is not modified. Modifications are permitted, but only if +the resulting file is not named hyphen.tex." + +Software distributed under these licenses is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the licences +for the specific language governing rights and limitations under the licenses. 
diff --git a/intl/hyphenation/hyphen/COPYING.LGPL b/intl/hyphenation/hyphen/COPYING.LGPL new file mode 100644 index 000000000..c4792dd27 --- /dev/null +++ b/intl/hyphenation/hyphen/COPYING.LGPL @@ -0,0 +1,515 @@ + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations +below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. 
These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. +^L + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. 
+ + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it +becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. 
+ + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. +^L + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control +compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. 
+ + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. 
+^L + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. 
+Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. +^L + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. +^L + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. 
However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. +^L + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply, and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License +may add an explicit geographical distribution limitation excluding those +countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. +^L + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS +^L + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms +of the ordinary General Public License). + + To apply these terms, attach the following notices to the library. +It is safest to attach them to the start of each source file to most +effectively convey the exclusion of warranty; and each file should +have at least the "copyright" line and a pointer to where the full +notice is found. + + + <one line to give the library's name and a brief idea of what it +does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper +mail. + +You should also get your employer (if you work as a programmer) or +your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James +Random Hacker. 
+ + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/intl/hyphenation/hyphen/COPYING.MPL b/intl/hyphenation/hyphen/COPYING.MPL new file mode 100644 index 000000000..7714141d1 --- /dev/null +++ b/intl/hyphenation/hyphen/COPYING.MPL @@ -0,0 +1,470 @@ + MOZILLA PUBLIC LICENSE + Version 1.1 + + --------------- + +1. Definitions. + + 1.0.1. "Commercial Use" means distribution or otherwise making the + Covered Code available to a third party. + + 1.1. "Contributor" means each entity that creates or contributes to + the creation of Modifications. + + 1.2. "Contributor Version" means the combination of the Original + Code, prior Modifications used by a Contributor, and the Modifications + made by that particular Contributor. + + 1.3. "Covered Code" means the Original Code or Modifications or the + combination of the Original Code and Modifications, in each case + including portions thereof. + + 1.4. "Electronic Distribution Mechanism" means a mechanism generally + accepted in the software development community for the electronic + transfer of data. + + 1.5. "Executable" means Covered Code in any form other than Source + Code. + + 1.6. "Initial Developer" means the individual or entity identified + as the Initial Developer in the Source Code notice required by Exhibit + A. + + 1.7. "Larger Work" means a work which combines Covered Code or + portions thereof with code not governed by the terms of this License. + + 1.8. "License" means this document. + + 1.8.1. "Licensable" means having the right to grant, to the maximum + extent possible, whether at the time of the initial grant or + subsequently acquired, any and all of the rights conveyed herein. + + 1.9. "Modifications" means any addition to or deletion from the + substance or structure of either the Original Code or any previous + Modifications. When Covered Code is released as a series of files, a + Modification is: + A. 
Any addition to or deletion from the contents of a file + containing Original Code or previous Modifications. + + B. Any new file that contains any part of the Original Code or + previous Modifications. + + 1.10. "Original Code" means Source Code of computer software code + which is described in the Source Code notice required by Exhibit A as + Original Code, and which, at the time of its release under this + License is not already Covered Code governed by this License. + + 1.10.1. "Patent Claims" means any patent claim(s), now owned or + hereafter acquired, including without limitation, method, process, + and apparatus claims, in any patent Licensable by grantor. + + 1.11. "Source Code" means the preferred form of the Covered Code for + making modifications to it, including all modules it contains, plus + any associated interface definition files, scripts used to control + compilation and installation of an Executable, or source code + differential comparisons against either the Original Code or another + well known, available Covered Code of the Contributor's choice. The + Source Code can be in a compressed or archival form, provided the + appropriate decompression or de-archiving software is widely available + for no charge. + + 1.12. "You" (or "Your") means an individual or a legal entity + exercising rights under, and complying with all of the terms of, this + License or a future version of this License issued under Section 6.1. + For legal entities, "You" includes any entity which controls, is + controlled by, or is under common control with You. For purposes of + this definition, "control" means (a) the power, direct or indirect, + to cause the direction or management of such entity, whether by + contract or otherwise, or (b) ownership of more than fifty percent + (50%) of the outstanding shares or beneficial ownership of such + entity. + +2. Source Code License. + + 2.1. The Initial Developer Grant. 
+ The Initial Developer hereby grants You a world-wide, royalty-free, + non-exclusive license, subject to third party intellectual property + claims: + (a) under intellectual property rights (other than patent or + trademark) Licensable by Initial Developer to use, reproduce, + modify, display, perform, sublicense and distribute the Original + Code (or portions thereof) with or without Modifications, and/or + as part of a Larger Work; and + + (b) under Patents Claims infringed by the making, using or + selling of Original Code, to make, have made, use, practice, + sell, and offer for sale, and/or otherwise dispose of the + Original Code (or portions thereof). + + (c) the licenses granted in this Section 2.1(a) and (b) are + effective on the date Initial Developer first distributes + Original Code under the terms of this License. + + (d) Notwithstanding Section 2.1(b) above, no patent license is + granted: 1) for code that You delete from the Original Code; 2) + separate from the Original Code; or 3) for infringements caused + by: i) the modification of the Original Code or ii) the + combination of the Original Code with other software or devices. + + 2.2. Contributor Grant. 
+ Subject to third party intellectual property claims, each Contributor + hereby grants You a world-wide, royalty-free, non-exclusive license + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Contributor, to use, reproduce, modify, + display, perform, sublicense and distribute the Modifications + created by such Contributor (or portions thereof) either on an + unmodified basis, with other Modifications, as Covered Code + and/or as part of a Larger Work; and + + (b) under Patent Claims infringed by the making, using, or + selling of Modifications made by that Contributor either alone + and/or in combination with its Contributor Version (or portions + of such combination), to make, use, sell, offer for sale, have + made, and/or otherwise dispose of: 1) Modifications made by that + Contributor (or portions thereof); and 2) the combination of + Modifications made by that Contributor with its Contributor + Version (or portions of such combination). + + (c) the licenses granted in Sections 2.2(a) and 2.2(b) are + effective on the date Contributor first makes Commercial Use of + the Covered Code. + + (d) Notwithstanding Section 2.2(b) above, no patent license is + granted: 1) for any code that Contributor has deleted from the + Contributor Version; 2) separate from the Contributor Version; + 3) for infringements caused by: i) third party modifications of + Contributor Version or ii) the combination of Modifications made + by that Contributor with other software (except as part of the + Contributor Version) or other devices; or 4) under Patent Claims + infringed by Covered Code in the absence of Modifications made by + that Contributor. + +3. Distribution Obligations. + + 3.1. Application of License. + The Modifications which You create or to which You contribute are + governed by the terms of this License, including without limitation + Section 2.2. 
The Source Code version of Covered Code may be + distributed only under the terms of this License or a future version + of this License released under Section 6.1, and You must include a + copy of this License with every copy of the Source Code You + distribute. You may not offer or impose any terms on any Source Code + version that alters or restricts the applicable version of this + License or the recipients' rights hereunder. However, You may include + an additional document offering the additional rights described in + Section 3.5. + + 3.2. Availability of Source Code. + Any Modification which You create or to which You contribute must be + made available in Source Code form under the terms of this License + either on the same media as an Executable version or via an accepted + Electronic Distribution Mechanism to anyone to whom you made an + Executable version available; and if made available via Electronic + Distribution Mechanism, must remain available for at least twelve (12) + months after the date it initially became available, or at least six + (6) months after a subsequent version of that particular Modification + has been made available to such recipients. You are responsible for + ensuring that the Source Code version remains available even if the + Electronic Distribution Mechanism is maintained by a third party. + + 3.3. Description of Modifications. + You must cause all Covered Code to which You contribute to contain a + file documenting the changes You made to create that Covered Code and + the date of any change. You must include a prominent statement that + the Modification is derived, directly or indirectly, from Original + Code provided by the Initial Developer and including the name of the + Initial Developer in (a) the Source Code, and (b) in any notice in an + Executable version or related documentation in which You describe the + origin or ownership of the Covered Code. + + 3.4. Intellectual Property Matters + (a) Third Party Claims. 
+ If Contributor has knowledge that a license under a third party's + intellectual property rights is required to exercise the rights + granted by such Contributor under Sections 2.1 or 2.2, + Contributor must include a text file with the Source Code + distribution titled "LEGAL" which describes the claim and the + party making the claim in sufficient detail that a recipient will + know whom to contact. If Contributor obtains such knowledge after + the Modification is made available as described in Section 3.2, + Contributor shall promptly modify the LEGAL file in all copies + Contributor makes available thereafter and shall take other steps + (such as notifying appropriate mailing lists or newsgroups) + reasonably calculated to inform those who received the Covered + Code that new knowledge has been obtained. + + (b) Contributor APIs. + If Contributor's Modifications include an application programming + interface and Contributor has knowledge of patent licenses which + are reasonably necessary to implement that API, Contributor must + also include this information in the LEGAL file. + + (c) Representations. + Contributor represents that, except as disclosed pursuant to + Section 3.4(a) above, Contributor believes that Contributor's + Modifications are Contributor's original creation(s) and/or + Contributor has sufficient rights to grant the rights conveyed by + this License. + + 3.5. Required Notices. + You must duplicate the notice in Exhibit A in each file of the Source + Code. If it is not possible to put such notice in a particular Source + Code file due to its structure, then You must include such notice in a + location (such as a relevant directory) where a user would be likely + to look for such a notice. If You created one or more Modification(s) + You may add your name as a Contributor to the notice described in + Exhibit A. 
You must also duplicate this License in any documentation + for the Source Code where You describe recipients' rights or ownership + rights relating to Covered Code. You may choose to offer, and to + charge a fee for, warranty, support, indemnity or liability + obligations to one or more recipients of Covered Code. However, You + may do so only on Your own behalf, and not on behalf of the Initial + Developer or any Contributor. You must make it absolutely clear than + any such warranty, support, indemnity or liability obligation is + offered by You alone, and You hereby agree to indemnify the Initial + Developer and every Contributor for any liability incurred by the + Initial Developer or such Contributor as a result of warranty, + support, indemnity or liability terms You offer. + + 3.6. Distribution of Executable Versions. + You may distribute Covered Code in Executable form only if the + requirements of Section 3.1-3.5 have been met for that Covered Code, + and if You include a notice stating that the Source Code version of + the Covered Code is available under the terms of this License, + including a description of how and where You have fulfilled the + obligations of Section 3.2. The notice must be conspicuously included + in any notice in an Executable version, related documentation or + collateral in which You describe recipients' rights relating to the + Covered Code. You may distribute the Executable version of Covered + Code or ownership rights under a license of Your choice, which may + contain terms different from this License, provided that You are in + compliance with the terms of this License and that the license for the + Executable version does not attempt to limit or alter the recipient's + rights in the Source Code version from the rights set forth in this + License. 
If You distribute the Executable version under a different + license You must make it absolutely clear that any terms which differ + from this License are offered by You alone, not by the Initial + Developer or any Contributor. You hereby agree to indemnify the + Initial Developer and every Contributor for any liability incurred by + the Initial Developer or such Contributor as a result of any such + terms You offer. + + 3.7. Larger Works. + You may create a Larger Work by combining Covered Code with other code + not governed by the terms of this License and distribute the Larger + Work as a single product. In such a case, You must make sure the + requirements of this License are fulfilled for the Covered Code. + +4. Inability to Comply Due to Statute or Regulation. + + If it is impossible for You to comply with any of the terms of this + License with respect to some or all of the Covered Code due to + statute, judicial order, or regulation then You must: (a) comply with + the terms of this License to the maximum extent possible; and (b) + describe the limitations and the code they affect. Such description + must be included in the LEGAL file described in Section 3.4 and must + be included with all distributions of the Source Code. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Application of this License. + + This License applies to code to which the Initial Developer has + attached the notice in Exhibit A and to related Covered Code. + +6. Versions of the License. + + 6.1. New Versions. + Netscape Communications Corporation ("Netscape") may publish revised + and/or new versions of the License from time to time. Each version + will be given a distinguishing version number. + + 6.2. Effect of New Versions. 
+ Once Covered Code has been published under a particular version of the + License, You may always continue to use it under the terms of that + version. You may also choose to use such Covered Code under the terms + of any subsequent version of the License published by Netscape. No one + other than Netscape has the right to modify the terms applicable to + Covered Code created under this License. + + 6.3. Derivative Works. + If You create or use a modified version of this License (which you may + only do in order to apply it to code which is not already Covered Code + governed by this License), You must (a) rename Your license so that + the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape", + "MPL", "NPL" or any confusingly similar phrase do not appear in your + license (except to note that your license differs from this License) + and (b) otherwise make it clear that Your version of the license + contains terms which differ from the Mozilla Public License and + Netscape Public License. (Filling in the name of the Initial + Developer, Original Code or Contributor in the notice described in + Exhibit A shall not of themselves be deemed to be modifications of + this License.) + +7. DISCLAIMER OF WARRANTY. + + COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, + WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF + DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. + THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE + IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, + YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE + COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER + OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF + ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. + +8. TERMINATION. + + 8.1. 
This License and the rights granted hereunder will terminate + automatically if You fail to comply with terms herein and fail to cure + such breach within 30 days of becoming aware of the breach. All + sublicenses to the Covered Code which are properly granted shall + survive any termination of this License. Provisions which, by their + nature, must remain in effect beyond the termination of this License + shall survive. + + 8.2. If You initiate litigation by asserting a patent infringement + claim (excluding declatory judgment actions) against Initial Developer + or a Contributor (the Initial Developer or Contributor against whom + You file such action is referred to as "Participant") alleging that: + + (a) such Participant's Contributor Version directly or indirectly + infringes any patent, then any and all rights granted by such + Participant to You under Sections 2.1 and/or 2.2 of this License + shall, upon 60 days notice from Participant terminate prospectively, + unless if within 60 days after receipt of notice You either: (i) + agree in writing to pay Participant a mutually agreeable reasonable + royalty for Your past and future use of Modifications made by such + Participant, or (ii) withdraw Your litigation claim with respect to + the Contributor Version against such Participant. If within 60 days + of notice, a reasonable royalty and payment arrangement are not + mutually agreed upon in writing by the parties or the litigation claim + is not withdrawn, the rights granted by Participant to You under + Sections 2.1 and/or 2.2 automatically terminate at the expiration of + the 60 day notice period specified above. 
+ + (b) any software, hardware, or device, other than such Participant's + Contributor Version, directly or indirectly infringes any patent, then + any rights granted to You by such Participant under Sections 2.1(b) + and 2.2(b) are revoked effective as of the date You first made, used, + sold, distributed, or had made, Modifications made by that + Participant. + + 8.3. If You assert a patent infringement claim against Participant + alleging that such Participant's Contributor Version directly or + indirectly infringes any patent where such claim is resolved (such as + by license or settlement) prior to the initiation of patent + infringement litigation, then the reasonable value of the licenses + granted by such Participant under Sections 2.1 or 2.2 shall be taken + into account in determining the amount or value of any payment or + license. + + 8.4. In the event of termination under Sections 8.1 or 8.2 above, + all end user license agreements (excluding distributors and resellers) + which have been validly granted by You or any distributor hereunder + prior to termination shall survive termination. + +9. LIMITATION OF LIABILITY. + + UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT + (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL + DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, + OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR + ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY + CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, + WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER + COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN + INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF + LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY + RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW + PROHIBITS SUCH LIMITATION. 
SOME JURISDICTIONS DO NOT ALLOW THE + EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO + THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU. + +10. U.S. GOVERNMENT END USERS. + + The Covered Code is a "commercial item," as that term is defined in + 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer + software" and "commercial computer software documentation," as such + terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 + C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), + all U.S. Government End Users acquire Covered Code with only those + rights set forth herein. + +11. MISCELLANEOUS. + + This License represents the complete agreement concerning subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. This License shall be governed by + California law provisions (except to the extent applicable law, if + any, provides otherwise), excluding its conflict-of-law provisions. + With respect to disputes in which at least one party is a citizen of, + or an entity chartered or registered to do business in the United + States of America, any litigation relating to this License shall be + subject to the jurisdiction of the Federal Courts of the Northern + District of California, with venue lying in Santa Clara County, + California, with the losing party responsible for costs, including + without limitation, court costs and reasonable attorneys' fees and + expenses. The application of the United Nations Convention on + Contracts for the International Sale of Goods is expressly excluded. + Any law or regulation which provides that the language of a contract + shall be construed against the drafter shall not apply to this + License. + +12. RESPONSIBILITY FOR CLAIMS. 
+ + As between Initial Developer and the Contributors, each party is + responsible for claims and damages arising, directly or indirectly, + out of its utilization of rights under this License and You agree to + work with Initial Developer and Contributors to distribute such + responsibility on an equitable basis. Nothing herein is intended or + shall be deemed to constitute any admission of liability. + +13. MULTIPLE-LICENSED CODE. + + Initial Developer may designate portions of the Covered Code as + "Multiple-Licensed". "Multiple-Licensed" means that the Initial + Developer permits you to utilize portions of the Covered Code under + Your choice of the NPL or the alternative licenses, if any, specified + by the Initial Developer in the file described in Exhibit A. + +EXHIBIT A -Mozilla Public License. + + ``The contents of this file are subject to the Mozilla Public License + Version 1.1 (the "License"); you may not use this file except in + compliance with the License. You may obtain a copy of the License at + http://www.mozilla.org/MPL/ + + Software distributed under the License is distributed on an "AS IS" + basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + License for the specific language governing rights and limitations + under the License. + + The Original Code is ______________________________________. + + The Initial Developer of the Original Code is ________________________. + Portions created by ______________________ are Copyright (C) ______ + _______________________. All Rights Reserved. + + Contributor(s): ______________________________________. + + Alternatively, the contents of this file may be used under the terms + of the _____ license (the "[___] License"), in which case the + provisions of [______] License are applicable instead of those + above. 
If you wish to allow use of your version of this file only + under the terms of the [____] License and not to allow others to use + your version of this file under the MPL, indicate your decision by + deleting the provisions above and replace them with the notice and + other provisions required by the [___] License. If you do not delete + the provisions above, a recipient may use your version of this file + under either the MPL or the [___] License." + + [NOTE: The text of this Exhibit A may differ slightly from the text of + the notices in the Source Code files of the Original Code. You should + use the text of this Exhibit A rather than the text found in the + Original Code Source Code for Your Modifications.] + diff --git a/intl/hyphenation/hyphen/README b/intl/hyphenation/hyphen/README new file mode 100644 index 000000000..82c612724 --- /dev/null +++ b/intl/hyphenation/hyphen/README @@ -0,0 +1,134 @@ +Hyphen - hyphenation library to use converted TeX hyphenation patterns + +(C) 1998 Raph Levien +(C) 2001 ALTLinux, Moscow +(C) 2006, 2007, 2008, 2010, 2011 László Németh + +This was part of libHnj library by Raph Levien. + +Peter Novodvorsky from ALTLinux cut hyphenation part from libHnj +to use it in OpenOffice.org. + +Compound word and non-standard hyphenation support by László Németh. + +License is the original LibHnj license: +LibHnj is dual licensed under LGPL and MPL (see also README.libhnj). + +Because LGPL allows GPL relicensing, COPYING contains now +LGPL/GPL/MPL tri-license for explicit Mozilla source compatibility. + +Original Libhnj source with OOo's patches are managed by Rene Engelhard +and Chris Halls at Debian: + +http://packages.debian.org/stable/libdevel/libhnj-dev +and http://packages.debian.org/unstable/source/libhnj + + +OTHER FILES + +This distribution is the source of the en_US hyphenation patterns +"hyph_en_US.dic", too. See README_hyph_en_US.txt. 
+ +Source files of hyph_en_US.dic in the distribution: + +hyphen.tex (en_US hyphenation patterns from plain TeX) + + Source: http://tug.ctan.org/text-archive/macros/plain/base/hyphen.tex + +tbhyphext.tex: hyphenation exception log from TugBoat archive + + Source of the hyphenation exception list: + http://www.ctan.org/tex-archive/info/digests/tugboat/tb0hyf.tex + + Generated with the hyphenex script + (http://www.ctan.org/tex-archive/info/digests/tugboat/hyphenex.sh) + + sh hyphenex.sh <tb0hyf.tex >tbhyphext.tex + + +INSTALLATION + +autoreconf -fvi +./configure +make +make install + +UNIT TESTS (WITH VALGRIND DEBUGGER) + +make check +VALGRIND=memcheck make check + +USAGE + +./example hyph_en_US.dic mywords.txt + +or (under Linux) + +echo example | ./example hyph_en_US.dic /dev/stdin + +NOTE: In the case of Unicode encoded input, convert your words +to lowercase before hyphenation (under UTF-8 console environment): + +cat mywords.txt | awk '{print tolower($0)}' >mywordslow.txt + +BUILD DLL USING CROSS-COMPILATION + +./configure --host i586-mingw32 --prefix=/tmp/hyphen-dll +make +make install + +DEVELOPMENT + +See README.hyphen for hyphenation algorithm, README.nonstandard +and doc/tb87nemeth.pdf for non-standard hyphenation, +README.compound for compound word hyphenation, and tests/*. + +Description of the dictionary format: + +First line contains the character encoding (ISO8859-x, UTF-8). + +Possible options in the following lines: + +LEFTHYPHENMIN num minimal hyphenation distance from the left word end +RIGHTHYPHENMIN num minimal hyphenation distance from the right word end +COMPOUNDLEFTHYPHENMIN num min. hyph. dist. from the left compound word boundary +COMPOUNDRIGHTHYPHENMIN num min. hyph. dist. from the right comp. 
word boundary + +hyphenation patterns see README.* files + +NEXTLEVEL separates the two compound sets (see README.compound) + +Default values: +Without explicit declarations, hyphenmin fields of dict struct +are zeroes, but in this case the lefthyphenmin and righthyphenmin +will be the default 2 under the hyphenation (for backward compatibility). + +Comments + +Use percent sign at the beginning of the lines to add comments to your +hyphenation patterns (after the character encoding in the first line): + +% comment + +***************************************************************************** +* Warning! Correct working of Libhnj *needs* prepared hyphenation patterns. * + +For example, generating hyph_en_US.dic from "hyphen.us" TeX patterns: + +perl substrings.pl hyphen.us hyph_en_US.dic ISO8859-1 + +or with default LEFTHYPHENMIN and RIGHTHYPHENMIN values: + +perl substrings.pl hyphen.us hyph_en_US.dic ISO8859-1 2 3 +perl substrings.pl hyphen.gb hyph_en_GB.dic ISO8859-1 3 3 +**************************************************************************** + +OTHERS + +Java hyphenation: Peter B. West (Folio project) implements a hyphenator with +non standard hyphenation facilities based on extended Libhnj. The HyFo module +is released in binary form as jar files and in source form as zip files. +See http://sourceforge.net/project/showfiles.php?group_id=119136 + +László Németh +<nemeth (at) numbertext (dot) org> diff --git a/intl/hyphenation/hyphen/README.compound b/intl/hyphenation/hyphen/README.compound new file mode 100644 index 000000000..bcb265853 --- /dev/null +++ b/intl/hyphenation/hyphen/README.compound @@ -0,0 +1,87 @@ +New option of Libhyphen 2.7: NOHYPHEN + +Hyphen, apostrophe and other characters may be word boundary characters, +but they don't need (extra) hyphenation. With NOHYPHEN option +it's possible to hyphenate the word parts correctly. 
+ +Example: + +ISO8859-1 +NOHYPHEN -,' +1-1 +1'1 +NEXTLEVEL + +Description: + +1-1 and 1'1 declare hyphen and apostrophe as word boundary characters +and NOHYPHEN with the comma separated character (or character sequence) +list forbids the (extra) hyphens at the hyphen and apostrophe characters. + +Implicit NOHYPHEN declaration + +Without explicit NEXTLEVEL declaration, Hyphen 2.8 uses the +previous settings, plus in UTF-8 encoding, endash (U+2013) and +typographical apostrophe (U+2019) are NOHYPHEN characters, too. + +It's possible to enlarge the hyphenation distance from these +NOHYPHEN characters by using COMPOUNDLEFTHYPHENMIN and +COMPOUNDRIGHTHYPHENMIN attributes. + +Compound word hyphenation + +Hyphen library supports better compound word hyphenation and special +rules of compound word hyphenation of German languages and other +languages with arbitrary number of compound words. The new options, +COMPOUNDLEFTHYPHENMIN and COMPOUNDRIGHTHYPHENMIN help to set the right +style for the hyphenation of compound words. + +Algorithm + +The algorithm is an extension of the original pattern based hyphenation +algorithm. It uses two hyphenation pattern sets, defined in the same +pattern file and separated by the NEXTLEVEL keyword. First pattern +set is for hyphenation only at compound word boundaries, the second one +is for hyphenation within words or word parts. + +Recursive compound level hyphenation + +The algorithm is recursive: every word part of a successful +first (compound) level hyphenation will be rehyphenated +by the same (first) pattern set. + +Finally, when first level hyphenation is not possible, Hyphen uses +the second level hyphenation for the word or the word parts. + +Word endings and word parts + +Patterns for word endings (patterns with ellipses) match the +word parts, too. + +Options + +COMPOUNDLEFTHYPHENMIN: min. hyph. dist. from the left compound word boundary +COMPOUNDRIGHTHYPHENMIN: min. hyph. dist. from the right comp. 
word boundary +NEXTLEVEL: marks the start of the second level hyphenation patterns + +Default hyphenmin values + +Default values of COMPOUNDLEFTHYPHENMIN and COMPOUNDRIGHTHYPHENMIN are 0, +and 0 under the hyphenation, too. ("0" values of +LEFTHYPHENMIN and RIGHTHYPHENMIN mean the default "2" under the hyphenation.) + +Examples + +See tests/compound* test files. + +Preparation of hyphenation patterns + +There is no special pattern generator tool for compound hyphenation +patterns, yet. It is possible to use PATGEN to generate both +pattern sets, concatenate them manually and set the requested HYPHENMIN values. +(But don't forget the preprocessing steps by substrings.pl before +concatenation.) One disadvantage of this method is that PATGEN +doesn't know the recursive compound hyphenation of Hyphen. + +László Németh +<nemeth (at) openoffice.org> diff --git a/intl/hyphenation/hyphen/README.hyphen b/intl/hyphenation/hyphen/README.hyphen new file mode 100644 index 000000000..8aa8c8767 --- /dev/null +++ b/intl/hyphenation/hyphen/README.hyphen @@ -0,0 +1,108 @@ +Brief explanation of the hyphenation algorithm herein.[1] + +Raph Levien <raph@acm.org> +4 Aug 1998 + + The hyphenation algorithm is basically the same as Knuth's TeX +algorithm. However, the implementation is quite a bit faster. + + The hyphenation files from TeX can almost be used directly. There +is a preprocessing step, however. If you don't do the preprocessing +step, you'll get bad hyphenations (i.e. a silent failure). + + Start with a file such as hyphen.us. This is the TeX ushyph1.tex +file, with the exception dictionary encoded using the same rules as +the main portion of the file. Any line beginning with % is a comment. +Each other line should contain exactly one rule. + + Then, do the preprocessing - "perl substrings.pl hyphen.us". The +resulting file is hyphen.mashed. 
It's in Perl, and it's fairly slow +(it uses brute force algorithms; about 17 seconds on a P100), but it +could probably be redone in C with clever algorithms. This would be +valuable, for example, if it were to handle user-supplied exception +dictionaries by integrating them into the rule table.[2] + + Once the rules are preprocessed, loading them is quite quick - +about 200ms on a P100. It then hyphenates at about 40,000 words per +second on a P100. I haven't benchmarked it against other +implementations (both TeX and groff contain essentially the same +algorithm), but expect that it runs quite a bit faster than any of +them. + +Knuth's algorithm + + This section contains a brief explanation of Knuth's algorithm, in +case you missed it from the TeX books. We'll use the semi-word +"example" as our running example. + + Since the beginning and end of a word are special, the algorithm is +actually run over the prepared word (prep_word in the source) +".example.". Knuth's algorithm basically just does pattern matches from +the rule set, then applies the matches. The patterns in this case that +match are "xa", "xam", "mp", and "pl". These are actually stored as +"x1a", "xam3", "4m1p", and "1p2l2". Whenever numbers appear between +the letters, they are added in. If two (or more) patterns have numbers +in the same place, the highest number wins. Here's the example: + + . e x a m p l e . + x1a + x a m3 + 4m1p + 1p2l2 + ----------------- + . e x1a4m3p2l2e . + + Finally, hyphens are placed wherever odd numbers appear. They are, +however, suppressed after the first letter and before the last letter +of the word (TeX actually suppresses them before the next-to-last, as +well). So, it's "ex-am-ple", which is correct. + + Knuth uses a trie to implement this. I.e. he stores each rule in a +trie structure. For each position in the word, he searches the trie, +searching for a match. Most patterns are short, so efficiency should +be quite good. 
+ +Theory of the algorithm + + The algorithm works as a slightly modified finite state machine. +There are two kinds of transitions: those that consume one letter of +input (which work just like your regular finite state machine), and +"fallback" transitions, which don't consume any input. If no +transition matching the next letter is found, the fallback is used. +One way of looking at this is a form of compression of the transition +tables - i.e. it behaves the same as a completely vanilla state +machine in which the actual transition table of a node is made up of +the union of transition tables of the node itself, plus its fallbacks. + + Each state is represented by a string. Thus, if the current state +is "am" and the next letter is "p", then the next state is "amp". +Fallback transitions go to states which chop off one or (sometimes) +more letters from the beginning. For example, if none of the +transitions from "amp" match the next letter, then it will fall back +to "mp". Similarly, if none of the transitions from "mp" match the +next letter, it will fall back to "m". + + Each state is also associated with a (possibly null) "match" +string. This represents the union of all patterns which are +right-justified substrings of the match string. I.e. the pattern "mp" +is a right-justified substring of the state "amp", so its numbers get +added in. The actual calculation of this union is done by the +Perl preprocessing script, but could probably be done in C just about +as easily. + + Because each state transition either consumes one input character +or shortens the state string by one character, the total number of +state transitions is linear in the length of the word. + +[1] Documentation: + +Franklin M. Liang: Word Hy-phen-a-tion by Com-put-er. +Stanford University, 1983. http://www.tug.org/docs/liang. + +László Németh: Automatic non-standard hyphenation in OpenOffice.org, +TUGboat (27), 2006. No. 
2., http://hunspell.sourceforge.net/tb87nemeth.pdf + +[2] There is the C version of pattern converter "substrings.c" +in the distribution written by Nanning Buitenhuis. Unfortunately, +this version hasn't handled the non standard extension of the +algorithm, yet. diff --git a/intl/hyphenation/hyphen/README.nonstandard b/intl/hyphenation/hyphen/README.nonstandard new file mode 100644 index 000000000..fd80d12c6 --- /dev/null +++ b/intl/hyphenation/hyphen/README.nonstandard @@ -0,0 +1,122 @@ +Non-standard hyphenation +------------------------ + +Some languages use non-standard hyphenation; `discretionary' +character changes at hyphenation points. For example, +Catalan: paral·lel -> paral-lel, +Dutch: omaatje -> oma-tje, +German (before the new orthography): Schiffahrt -> Schiff-fahrt, +Hungarian: asszonnyal -> asz-szony-nyal (multiple occurrence!) +Swedish: tillata -> till-lata. + +Using this extended library, you can define +non-standard hyphenation patterns. For example: + +l·1l/l=l +a1atje./a=t,1,3 +.schif1fahrt/ff=f,5,2 +.as3szon/sz=sz,2,3 +n1nyal./ny=ny,1,3 +.til1lata./ll=l,3,2 + +or with narrow boundaries: + +l·1l/l=,1,2 +a1atje./a=,1,1 +.schif1fahrt/ff=,5,1 +.as3szon/sz=,2,1 +n1nyal./ny=,1,1 +.til1lata./ll=,3,1 + +Note: Libhnj uses modified patterns by preparing substrings.pl. +Unfortunately, now the conversion step can generate bad non-standard +patterns (non-standard -> standard pattern conversion), so using +narrow boundaries may be better for recent Libhnj. For example, +substrings.pl generates a few bad patterns for Hungarian hyphenation +patterns resulting in bad non-standard hyphenation in a few cases. Using narrow +boundaries solves this problem. Java HyFo module can check this problem. 
+ +Syntax of the non-standard hyphenation patterns +------------------------------------------------ + +pat1tern/change[,start,cut] + +If this pattern matches the word, and this pattern wins (see README.hyphen) +in the change region of the pattern, then pattern[start, start + cut - 1] +substring will be replaced with the "change". + +For example, a German ff -> ff-f hyphenation: + +f1f/ff=f + +or with expansion + +f1f/ff=f,1,2 + +will change every "ff" with "ff=f" at hyphenation. + +A more real example: + +% simple ff -> f-f hyphenation +f1f +% Schiffahrt -> Schiff-fahrt hyphenation +% +schif3fahrt/ff=f,5,2 + +Specification + +- Pattern: matching patterns of the original Liang's algorithm + - patterns must contain only one hyphenation point at change region + signed with a one-digit odd number (1, 3, 5, 7 or 9). + This point may be at subregion boundaries: schif3fahrt/ff=,5,1 + - only the greater value guarantees the win (don't mix standard and + non-standard patterns with the same value, for example + instead of f3f and schif3fahrt/ff=f,5,2 use f3f and schif5fahrt/ff=f,5,2) + +- Change: new characters. + Arbitrary character sequence. Equal sign (=) signs hyphenation points + for OpenOffice.org (like in the example). (In a possible German LaTeX + preprocessor, ff could be replaced with "ff, for a Hungarian one, ssz + with `ssz, according to the German and Hungarian Babel settings.) + +- Start: starting position of the change region. + - begins with 1 (not 0): schif3fahrt/ff=f,5,2 + - start dot doesn't matter: .schif3fahrt/ff=f,5,2 + - numbers don't matter: .s2c2h2i2f3f2ahrt/ff=f,5,2 + - In UTF-8 encoding, use Unicode character positions: össze/sz=sz,2,3 + ("össze" looks like "Ã¶ssze" in an ISO 8859-1 8-bit editor). + +- Cut: length of the removed character sequence in the original word. + - In UTF-8 encoding, use Unicode character length: paral·1lel/l=l,5,3 + ("paral·1lel" looks like "paralÂ·1lel" in an ISO 8859-1 8-bit editor). 
+ +Dictionary developing +--------------------- + +There is no extended PatGen pattern generator for non-standard +hyphenation patterns, yet. + +Fortunately, non-standard hyphenation points are forbidden in the PatGen +generated hyphenation patterns, so with a little patch it is possible to develop +non-standard hyphenation patterns also in this case. + +Warning: If you use UTF-8 Unicode encoding in your patterns, call +substrings.pl with UTF-8 parameter to calculate right +character positions for non-standard hyphenation: + +./substrings.pl input output UTF-8 + +Programming +----------- + +Use hyphenate2() or hyphenate3() to handle non-standard hyphenation. +See hyphen.h for the documentation of the hyphenate*() functions. +See example.c for processing the output of the hyphenate*() functions. + +Warning: change characters are lower cased in the source, so you may need +case conversion of the change characters based on input word case detection. +For example, see OpenOffice.org source +(lingucomponent/source/hyphenator/altlinuxhyph/hyphen/hyphenimp.cxx). + +László Németh +<nemeth (at) openoffice.org> diff --git a/intl/hyphenation/hyphen/hyphen.c b/intl/hyphenation/hyphen/hyphen.c new file mode 100644 index 000000000..9a132d026 --- /dev/null +++ b/intl/hyphenation/hyphen/hyphen.c @@ -0,0 +1,1187 @@ +/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both + * licenses follows. + */ + +/* LibHnj - a library for high quality hyphenation and justification + * Copyright (C) 1998 Raph Levien, + * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), + * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) + * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307 USA. +*/ + +/* + * The contents of this file are subject to the Mozilla Public License + * Version 1.0 (the "MPL"); you may not use this file except in + * compliance with the MPL. You may obtain a copy of the MPL at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the MPL is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL + * for the specific language governing rights and limitations under the + * MPL. + * + */ +#include <stdlib.h> /* for NULL, malloc */ +#include <stdio.h> /* for fprintf */ +#include <string.h> /* for strdup */ +#include <limits.h> /* for INT_MAX */ + +#ifdef UNX +#include <unistd.h> /* for exit */ +#endif + +#define noVERBOSE + +/* calculate hyphenmin values with long ligature length (2 or 3 characters + * instead of 1 or 2) for comparison with hyphenation without ligatures */ +#define noLONG_LIGATURE + +#ifdef LONG_LIGATURE +#define LIG_xx 1 +#define LIG_xxx 2 +#else +#define LIG_xx 0 +#define LIG_xxx 1 +#endif + +#include "hnjalloc.h" +#include "hyphen.h" + +static char * +hnj_strdup (const char *s) +{ + char *newstr; + int l; + + l = strlen (s); + newstr = (char *) hnj_malloc (l + 1); + memcpy (newstr, s, l); + newstr[l] = 0; + return newstr; +} + +/* remove cross-platform text line end characters */ +void hnj_strchomp(char * s) +{ + int k = strlen(s); + if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; + if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; +} + +/* a 
little bit of a hash table implementation. This simply maps strings + to state numbers */ + +typedef struct _HashTab HashTab; +typedef struct _HashEntry HashEntry; + +/* A cheap, but effective, hack. */ +#define HASH_SIZE 31627 + +struct _HashTab { + HashEntry *entries[HASH_SIZE]; +}; + +struct _HashEntry { + HashEntry *next; + char *key; + int val; +}; + +/* a char* hash function from ASU - adapted from Gtk+ */ +static unsigned int +hnj_string_hash (const char *s) +{ + const char *p; + unsigned int h=0, g; + for(p = s; *p != '\0'; p += 1) { + h = ( h << 4 ) + *p; + if ( ( g = h & 0xf0000000 ) ) { + h = h ^ (g >> 24); + h = h ^ g; + } + } + return h /* % M */; +} + +static HashTab * +hnj_hash_new (void) +{ + HashTab *hashtab; + int i; + + hashtab = (HashTab *) hnj_malloc (sizeof(HashTab)); + for (i = 0; i < HASH_SIZE; i++) + hashtab->entries[i] = NULL; + + return hashtab; +} + +static void +hnj_hash_free (HashTab *hashtab) +{ + int i; + HashEntry *e, *next; + + for (i = 0; i < HASH_SIZE; i++) + for (e = hashtab->entries[i]; e; e = next) + { + next = e->next; + hnj_free (e->key); + hnj_free (e); + } + + hnj_free (hashtab); +} + +/* assumes that key is not already present! */ +static void +hnj_hash_insert (HashTab *hashtab, const char *key, int val) +{ + int i; + HashEntry *e; + + i = hnj_string_hash (key) % HASH_SIZE; + e = (HashEntry *) hnj_malloc (sizeof(HashEntry)); + e->next = hashtab->entries[i]; + e->key = hnj_strdup (key); + e->val = val; + hashtab->entries[i] = e; +} + +/* return val if found, otherwise -1 */ +static int +hnj_hash_lookup (HashTab *hashtab, const char *key) +{ + int i; + HashEntry *e; + i = hnj_string_hash (key) % HASH_SIZE; + for (e = hashtab->entries[i]; e; e = e->next) + if (!strcmp (key, e->key)) + return e->val; + return -1; +} + +/* Get the state number, allocating a new state if necessary. 
*/ +static int +hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) +{ + int state_num; + + state_num = hnj_hash_lookup (hashtab, string); + + if (state_num >= 0) + return state_num; + + hnj_hash_insert (hashtab, string, dict->num_states); + /* predicate is true if dict->num_states is a power of two */ + if (!(dict->num_states & (dict->num_states - 1))) + { + dict->states = (HyphenState *) hnj_realloc (dict->states, + (dict->num_states << 1) * + sizeof(HyphenState)); + } + dict->states[dict->num_states].match = NULL; + dict->states[dict->num_states].repl = NULL; + dict->states[dict->num_states].fallback_state = -1; + dict->states[dict->num_states].num_trans = 0; + dict->states[dict->num_states].trans = NULL; + return dict->num_states++; +} + +/* add a transition from state1 to state2 through ch - assumes that the + transition does not already exist */ +static void +hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) +{ + int num_trans; + + num_trans = dict->states[state1].num_trans; + if (num_trans == 0) + { + dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans)); + } + else if (!(num_trans & (num_trans - 1))) + { + dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans, + (num_trans << 1) * + sizeof(HyphenTrans)); + } + dict->states[state1].trans[num_trans].ch = ch; + dict->states[state1].trans[num_trans].new_state = state2; + dict->states[state1].num_trans++; +} + +#ifdef VERBOSE +HashTab *global[1]; + +static char * +get_state_str (int state, int level) +{ + int i; + HashEntry *e; + + for (i = 0; i < HASH_SIZE; i++) + for (e = global[level]->entries[i]; e; e = e->next) + if (e->val == state) + return e->key; + return NULL; +} +#endif + +void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { + int i, j; + char word[MAX_CHARS]; + char pattern[MAX_CHARS]; + char * repl; + signed char replindex; + signed char replcut; + int state_num = 0; + int last_state; + 
char ch; + int found; + + if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { + dict->lhmin = atoi(buf + 13); + return; + } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { + dict->rhmin = atoi(buf + 14); + return; + } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { + dict->clhmin = atoi(buf + 21); + return; + } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { + dict->crhmin = atoi(buf + 22); + return; + } else if (strncmp(buf, "NOHYPHEN", 8) == 0) { + char * space = buf + 8; + while (*space != '\0' && (*space == ' ' || *space == '\t')) space++; + if (*buf != '\0') dict->nohyphen = hnj_strdup(space); + if (dict->nohyphen) { + char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1; + *nhe = 0; + for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) { + if (*nhe == ',') { + dict->nohyphenl++; + *nhe = 0; + } + } + } + return; + } + j = 0; + pattern[j] = '0'; + repl = strchr(buf, '/'); + replindex = 0; + replcut = 0; + if (repl) { + char * index = strchr(repl + 1, ','); + *repl = '\0'; + if (index) { + char * index2 = strchr(index + 1, ','); + *index = '\0'; + if (index2) { + *index2 = '\0'; + replindex = (signed char) atoi(index + 1) - 1; + replcut = (signed char) atoi(index2 + 1); + } + } else { + hnj_strchomp(repl + 1); + replindex = 0; + replcut = (signed char) strlen(buf); + } + repl = hnj_strdup(repl + 1); + } + for (i = 0; (unsigned char)buf[i] > (unsigned char)' '; i++) + { + if (buf[i] >= '0' && buf[i] <= '9') + pattern[j] = buf[i]; + else + { + word[j] = buf[i]; + pattern[++j] = '0'; + } + } + word[j] = '\0'; + pattern[j + 1] = '\0'; + + i = 0; + if (!repl) { + /* Optimize away leading zeroes */ + for (; pattern[i] == '0'; i++); + } else { + if (*word == '.') i++; + /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ + if (dict->utf8) { + int pu = -1; /* unicode character position */ + int ps = -1; /* unicode start position (original replindex) */ + size_t pc = (*word == '.') ? 
1: 0; /* 8-bit character position */ + for (; pc < (strlen(word) + 1); pc++) { + /* beginning of an UTF-8 character (not '10' start bits) */ + if ((((unsigned char) word[pc]) >> 6) != 2) pu++; + if ((ps < 0) && (replindex == pu)) { + ps = replindex; + replindex = (signed char) pc; + } + if ((ps >= 0) && ((pu - ps) == replcut)) { + replcut = (signed char) (pc - replindex); + break; + } + } + if (*word == '.') replindex--; + } + } + +#ifdef VERBOSE + printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); +#endif + found = hnj_hash_lookup (hashtab, word); + state_num = hnj_get_state (dict, hashtab, word); + dict->states[state_num].match = hnj_strdup (pattern + i); + dict->states[state_num].repl = repl; + dict->states[state_num].replindex = replindex; + if (!replcut) { + dict->states[state_num].replcut = (signed char) strlen(word); + } else { + dict->states[state_num].replcut = replcut; + } + + /* now, put in the prefix transitions */ + for (; found < 0 && j > 0; --j) + { + last_state = state_num; + ch = word[j - 1]; + word[j - 1] = '\0'; + found = hnj_hash_lookup (hashtab, word); + state_num = hnj_get_state (dict, hashtab, word); + hnj_add_trans (dict, state_num, last_state, ch); + } +} + +HyphenDict * +hnj_hyphen_load (const char *fn) +{ + HyphenDict *result; + FILE *f; + f = fopen (fn, "r"); + if (f == NULL) + return NULL; + + result = hnj_hyphen_load_file(f); + + fclose(f); + return result; +} + +HyphenDict * +hnj_hyphen_load_file (FILE *f) +{ + HyphenDict *dict[2]; + HashTab *hashtab; + char buf[MAX_CHARS]; + int nextlevel = 0; + int i, j, k; + HashEntry *e; + int state_num = 0; +/* loading one or two dictionaries (separated by NEXTLEVEL keyword) */ +for (k = 0; k < 2; k++) { + hashtab = hnj_hash_new (); +#ifdef VERBOSE + global[k] = hashtab; +#endif + hnj_hash_insert (hashtab, "", 0); + dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict)); + dict[k]->num_states = 1; + dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState)); + 
dict[k]->states[0].match = NULL; + dict[k]->states[0].repl = NULL; + dict[k]->states[0].fallback_state = -1; + dict[k]->states[0].num_trans = 0; + dict[k]->states[0].trans = NULL; + dict[k]->nextlevel = NULL; + dict[k]->lhmin = 0; + dict[k]->rhmin = 0; + dict[k]->clhmin = 0; + dict[k]->crhmin = 0; + dict[k]->nohyphen = NULL; + dict[k]->nohyphenl = 0; + + /* read in character set info */ + if (k == 0) { + for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; + if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { + for (i=0;i<MAX_NAME;i++) + if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) + dict[k]->cset[i] = 0; + } else { + dict[k]->cset[0] = 0; + } + dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); + } else { + strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1); + dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0'; + dict[k]->utf8 = dict[0]->utf8; + } + + if (k == 0 || nextlevel) { + while (fgets (buf, sizeof(buf), f) != NULL) { + if (strncmp(buf, "NEXTLEVEL", 9) == 0) { + nextlevel = 1; + break; + } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab); + } + } else if (k == 1) { + /* default first level: hyphen and ASCII apostrophe */ + if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab); + else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab); + strncpy(buf, "1-1\n", MAX_CHARS-1); /* buf rewritten by hnj_hyphen_load here */ + buf[MAX_CHARS-1] = '\0'; + hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */ + hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */ + if (dict[0]->utf8) { + hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */ + hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */ + } + } + + /* Could do unioning of matches here (instead of the preprocessor script). 
+ If we did, the pseudocode would look something like this: + + foreach state in the hash table + foreach i = [1..length(state) - 1] + state to check is substr (state, i) + look it up + if found, and if there is a match, union the match in. + + It's also possible to avoid the quadratic blowup by doing the + search in order of increasing state string sizes - then you + can break the loop after finding the first match. + + This step should be optional in any case - if there is a + preprocessed rule table, it's always faster to use that. + +*/ + + /* put in the fallback states */ + for (i = 0; i < HASH_SIZE; i++) + for (e = hashtab->entries[i]; e; e = e->next) + { + if (*(e->key)) for (j = 1; 1; j++) + { + state_num = hnj_hash_lookup (hashtab, e->key + j); + if (state_num >= 0) + break; + } + /* KBH: FIXME state 0 fallback_state should always be -1? */ + if (e->val) + dict[k]->states[e->val].fallback_state = state_num; + } +#ifdef VERBOSE + for (i = 0; i < HASH_SIZE; i++) + for (e = hashtab->entries[i]; e; e = e->next) + { + printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, + dict[k]->states[e->val].fallback_state); + for (j = 0; j < dict[k]->states[e->val].num_trans; j++) + printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, + dict[k]->states[e->val].trans[j].new_state); + } +#endif + +#ifndef VERBOSE + hnj_hash_free (hashtab); +#endif + state_num = 0; +} + if (nextlevel) dict[0]->nextlevel = dict[1]; + else { + dict[1] -> nextlevel = dict[0]; + dict[1]->lhmin = dict[0]->lhmin; + dict[1]->rhmin = dict[0]->rhmin; + dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3); + dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? 
dict[0]->rhmin : 3); +#ifdef VERBOSE + HashTab *r = global[0]; + global[0] = global[1]; + global[1] = r; +#endif + return dict[1]; + } + return dict[0]; +} + +void hnj_hyphen_free (HyphenDict *dict) +{ + int state_num; + HyphenState *hstate; + + for (state_num = 0; state_num < dict->num_states; state_num++) + { + hstate = &dict->states[state_num]; + if (hstate->match) + hnj_free (hstate->match); + if (hstate->repl) + hnj_free (hstate->repl); + if (hstate->trans) + hnj_free (hstate->trans); + } + if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); + + if (dict->nohyphen) hnj_free(dict->nohyphen); + + hnj_free (dict->states); + + hnj_free (dict); +} + +#define MAX_WORD 256 + +int hnj_hyphen_hyphenate (HyphenDict *dict, + const char *word, int word_size, + char *hyphens) +{ + char *prep_word; + int i, j, k; + int state; + char ch; + HyphenState *hstate; + char *match; + int offset; + + prep_word = (char*) hnj_malloc (word_size + 3); + + j = 0; + prep_word[j++] = '.'; + + for (i = 0; i < word_size; i++) { + if (word[i] <= '9' && word[i] >= '0') { + prep_word[j++] = '.'; + } else { + prep_word[j++] = word[i]; + } + } + + prep_word[j++] = '.'; + prep_word[j] = '\0'; + + for (i = 0; i < word_size + 5; i++) + hyphens[i] = '0'; + +#ifdef VERBOSE + printf ("prep_word = %s\n", prep_word); +#endif + + /* now, run the finite state machine */ + state = 0; + for (i = 0; i < j; i++) + { + ch = prep_word[i]; + for (;;) + { + + if (state == -1) { + /* return 1; */ + /* KBH: FIXME shouldn't this be as follows? 
*/ + state = 0; + goto try_next_letter; + } + +#ifdef VERBOSE + char *state_str; + state_str = get_state_str (state, 0); + + for (k = 0; k < i - strlen (state_str); k++) + putchar (' '); + printf ("%s", state_str); +#endif + + hstate = &dict->states[state]; + for (k = 0; k < hstate->num_trans; k++) + if (hstate->trans[k].ch == ch) + { + state = hstate->trans[k].new_state; + goto found_state; + } + state = hstate->fallback_state; +#ifdef VERBOSE + printf (" falling back, fallback_state %d\n", state); +#endif + } + found_state: +#ifdef VERBOSE + printf ("found state %d\n",state); +#endif + /* Additional optimization is possible here - especially, + elimination of trailing zeroes from the match. Leading zeroes + have already been optimized. */ + match = dict->states[state].match; + /* replacing rules not handled by hyphen_hyphenate() */ + if (match && !dict->states[state].repl) + { + offset = i + 1 - strlen (match); +#ifdef VERBOSE + for (k = 0; k < offset; k++) + putchar (' '); + printf ("%s\n", match); +#endif + /* This is a linear search because I tried a binary search and + found it to be just a teeny bit slower. 
*/ + for (k = 0; match[k]; k++) + if (hyphens[offset + k] < match[k]) + hyphens[offset + k] = match[k]; + } + + /* KBH: we need this to make sure we keep looking in a word */ + /* for patterns even if the current character is not known in state 0 */ + /* since patterns for hyphenation may occur anywhere in the word */ + try_next_letter: ; + + } +#ifdef VERBOSE + for (i = 0; i < j; i++) + putchar (hyphens[i]); + putchar ('\n'); +#endif + + for (i = 0; i < j - 4; i++) +#if 0 + if (hyphens[i + 1] & 1) + hyphens[i] = '-'; +#else + hyphens[i] = hyphens[i + 1]; +#endif + hyphens[0] = '0'; + for (; i < word_size; i++) + hyphens[i] = '0'; + hyphens[word_size] = '\0'; + + hnj_free (prep_word); + + return 0; +} + +/* Unicode ligature length */ +int hnj_ligature(unsigned char c) { + switch (c) { + case 0x80: /* ff */ + case 0x81: /* fi */ + case 0x82: return LIG_xx; /* fl */ + case 0x83: /* ffi */ + case 0x84: return LIG_xxx; /* ffl */ + case 0x85: /* long st */ + case 0x86: return LIG_xx; /* st */ + } + return 0; +} + +/* character length of the first n byte of the input word */ +int hnj_hyphen_strnlen(const char * word, int n, int utf8) +{ + int i = 0; + int j = 0; + while (j < n && word[j] != '\0') { + i++; + /* Unicode ligature support */ + if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { + i += hnj_ligature(word[j + 2]); + } + for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); + } + return i; +} + +int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, + char *** rep, int ** pos, int ** cut, int lhmin) +{ + int i = 1, j; + + /* Unicode ligature support */ + if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) { + i += hnj_ligature(word[2]); + } + + /* ignore numbers */ + for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--; + + for (j = 0; i < lhmin && word[j] != '\0'; i++) do { + /* check length of the non-standard part */ + if (*rep && *pos && *cut && (*rep)[j]) { + char * 
rh = strchr((*rep)[j], '='); + if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + + hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { + free((*rep)[j]); + (*rep)[j] = NULL; + hyphens[j] = '0'; + } + } else { + hyphens[j] = '0'; + } + j++; + + /* Unicode ligature support */ + if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { + i += hnj_ligature(word[j + 2]); + } + } while (utf8 && (word[j] & 0xc0) == 0x80); + return 0; +} + +int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, + char *** rep, int ** pos, int ** cut, int rhmin) +{ + int i = 0; + int j; + + /* ignore numbers */ + for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--; + + for (j = word_size - 1; i < rhmin && j > 0; j--) { + /* check length of the non-standard part */ + if (*rep && *pos && *cut && (*rep)[j]) { + char * rh = strchr((*rep)[j], '='); + if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) + + hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { + free((*rep)[j]); + (*rep)[j] = NULL; + hyphens[j] = '0'; + } + } else { + hyphens[j] = '0'; + } + if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++; + } + return 0; +} + +/* recursive function for compound level hyphenation */ +int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, + char * hyphens, char *** rep, int ** pos, int ** cut, + int clhmin, int crhmin, int lend, int rend) +{ + char *prep_word; + int i, j, k; + int state; + char ch; + HyphenState *hstate; + char *match; + char *repl; + signed char replindex; + signed char replcut; + int offset; + int * matchlen; + int * matchindex; + char ** matchrepl; + int isrepl = 0; + int nHyphCount; + + size_t prep_word_size = word_size + 3; + prep_word = (char*) hnj_malloc (prep_word_size); + matchlen = (int*) hnj_malloc ((word_size + 3) * sizeof(int)); + matchindex = (int*) hnj_malloc ((word_size + 3) * 
sizeof(int)); + matchrepl = (char**) hnj_malloc ((word_size + 3) * sizeof(char *)); + + j = 0; + prep_word[j++] = '.'; + + for (i = 0; i < word_size; i++) { + if (word[i] <= '9' && word[i] >= '0') { + prep_word[j++] = '.'; + } else { + prep_word[j++] = word[i]; + } + } + + + + prep_word[j++] = '.'; + prep_word[j] = '\0'; + + for (i = 0; i < j; i++) + hyphens[i] = '0'; + +#ifdef VERBOSE + printf ("prep_word = %s\n", prep_word); +#endif + + /* now, run the finite state machine */ + state = 0; + for (i = 0; i < j; i++) + { + ch = prep_word[i]; + for (;;) + { + + if (state == -1) { + /* return 1; */ + /* KBH: FIXME shouldn't this be as follows? */ + state = 0; + goto try_next_letter; + } + +#ifdef VERBOSE + char *state_str; + state_str = get_state_str (state, 1); + + for (k = 0; k < i - strlen (state_str); k++) + putchar (' '); + printf ("%s", state_str); +#endif + + hstate = &dict->states[state]; + for (k = 0; k < hstate->num_trans; k++) + if (hstate->trans[k].ch == ch) + { + state = hstate->trans[k].new_state; + goto found_state; + } + state = hstate->fallback_state; +#ifdef VERBOSE + printf (" falling back, fallback_state %d\n", state); +#endif + } + found_state: +#ifdef VERBOSE + printf ("found state %d\n",state); +#endif + /* Additional optimization is possible here - especially, + elimination of trailing zeroes from the match. Leading zeroes + have already been optimized. 
*/ + match = dict->states[state].match; + repl = dict->states[state].repl; + replindex = dict->states[state].replindex; + replcut = dict->states[state].replcut; + /* replacing rules not handled by hyphen_hyphenate() */ + if (match) + { + offset = i + 1 - strlen (match); +#ifdef VERBOSE + for (k = 0; k < offset; k++) + putchar (' '); + printf ("%s (%s)\n", match, repl); +#endif + if (repl) { + if (!isrepl) for(; isrepl < word_size; isrepl++) { + matchrepl[isrepl] = NULL; + matchindex[isrepl] = -1; + } + matchlen[offset + replindex] = replcut; + } + /* This is a linear search because I tried a binary search and + found it to be just a teeny bit slower. */ + for (k = 0; match[k]; k++) { + if ((hyphens[offset + k] < match[k])) { + hyphens[offset + k] = match[k]; + if (match[k]&1) { + matchrepl[offset + k] = repl; + if (repl && (k >= replindex) && (k <= replindex + replcut)) { + matchindex[offset + replindex] = offset + k; + } + } + } + } + + } + + /* KBH: we need this to make sure we keep looking in a word */ + /* for patterns even if the current character is not known in state 0 */ + /* since patterns for hyphenation may occur anywhere in the word */ + try_next_letter: ; + + } +#ifdef VERBOSE + for (i = 0; i < j; i++) + putchar (hyphens[i]); + putchar ('\n'); +#endif + + for (i = 0; i < j - 3; i++) +#if 0 + if (hyphens[i + 1] & 1) + hyphens[i] = '-'; +#else + hyphens[i] = hyphens[i + 1]; +#endif + for (; i < word_size; i++) + hyphens[i] = '0'; + hyphens[word_size] = '\0'; + + /* now create a new char string showing hyphenation positions */ + /* count the hyphens and allocate space for the new hyphenated string */ + nHyphCount = 0; + for (i = 0; i < word_size; i++) + if (hyphens[i]&1) + nHyphCount++; + j = 0; + for (i = 0; i < word_size; i++) { + if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { + if (rep && pos && cut) { + if (!*rep) + *rep = (char **) calloc(word_size, sizeof(char *)); + if (!*pos) + *pos = (int *) calloc(word_size, sizeof(int)); + if 
(!*cut) { + *cut = (int *) calloc(word_size, sizeof(int)); + } + (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); + (*pos)[matchindex[i] - 1] = matchindex[i] - i; + (*cut)[matchindex[i] - 1] = matchlen[i]; + } + j += strlen(matchrepl[matchindex[i]]); + i += matchlen[i] - 1; + } + } + + hnj_free (matchrepl); + hnj_free (matchlen); + hnj_free (matchindex); + + /* recursive hyphenation of the first (compound) level segments */ + if (dict->nextlevel) { + char ** rep2; + int * pos2; + int * cut2; + char * hyphens2; + int begin = 0; + + rep2 = (char**) hnj_malloc (word_size * sizeof(char *)); + pos2 = (int*) hnj_malloc (word_size * sizeof(int)); + cut2 = (int*) hnj_malloc (word_size * sizeof(int)); + hyphens2 = (char*) hnj_malloc (word_size + 3); + for (i = 0; i < word_size; i++) rep2[i] = NULL; + for (i = 0; i < word_size; i++) if + (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { + if (i - begin > 0) { + int hyph = 0; + prep_word[i + 2] = '\0'; + /* non-standard hyphenation at compound boundary (Schiffahrt) */ + if (rep && *rep && *pos && *cut && (*rep)[i]) { + char * l = strchr((*rep)[i], '='); + size_t offset = 2 + i - (*pos)[i]; + strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1); + prep_word[prep_word_size - 1] = '\0'; + if (l) { + hyph = (l - (*rep)[i]) - (*pos)[i]; + prep_word[2 + i + hyph] = '\0'; + } + } + hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, + hyphens2, &rep2, &pos2, &cut2, clhmin, + crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 
0 : rend)); + for (j = 0; j < i - begin; j++) { + hyphens[begin + j] = hyphens2[j]; + if (rep2[j] && rep && pos && cut) { + if (!*rep && !*pos && !*cut) { + int k; + *rep = (char **) malloc(sizeof(char *) * word_size); + *pos = (int *) malloc(sizeof(int) * word_size); + *cut = (int *) malloc(sizeof(int) * word_size); + for (k = 0; k < word_size; k++) { + (*rep)[k] = NULL; + (*pos)[k] = 0; + (*cut)[k] = 0; + } + } + (*rep)[begin + j] = rep2[j]; + (*pos)[begin + j] = pos2[j]; + (*cut)[begin + j] = cut2[j]; + } + } + prep_word[i + 2] = word[i + 1]; + if (*rep && *pos && *cut && (*rep)[i]) { + size_t offset = 1; + strncpy(prep_word + offset, word, prep_word_size - offset - 1); + prep_word[prep_word_size - 1] = '\0'; + } + } + begin = i + 1; + for (j = 0; j < word_size; j++) rep2[j] = NULL; + } + + /* non-compound */ + if (begin == 0) { + hnj_hyphen_hyph_(dict->nextlevel, word, word_size, + hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); + if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, + rep, pos, cut, clhmin); + if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, + rep, pos, cut, crhmin); + } + + free(rep2); + free(cut2); + free(pos2); + free(hyphens2); + } + + hnj_free (prep_word); + return 0; +} + +/* UTF-8 normalization of hyphen and non-standard positions */ +int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, + char *** rep, int ** pos, int ** cut) +{ + int i, j, k; + if ((((unsigned char) word[0]) >> 6) == 2) { + fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); + return 1; + } + + /* calculate UTF-8 character positions */ + for (i = 0, j = -1; i < word_size; i++) { + /* beginning of an UTF-8 character (not '10' start bits) */ + if ((((unsigned char) word[i]) >> 6) != 2) j++; + hyphens[j] = hyphens[i]; + if (rep && pos && cut && *rep && *pos && *cut) { + int l = (*pos)[i]; + (*pos)[j] = 0; + for (k = 0; k < l; k++) { + if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; + } + k = i - l + 1; 
+ l = k + (*cut)[i]; + (*cut)[j] = 0; + for (; k < l; k++) { + if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; + } + (*rep)[j] = (*rep)[i]; + if (j < i) { + (*rep)[i] = NULL; + (*pos)[i] = 0; + (*cut)[i] = 0; + } + } + } + hyphens[j + 1] = '\0'; +#ifdef VERBOSE + printf ("nums: %s\n", hyphens); +#endif + return 0; +} + +/* get the word with all possible hyphenations (output: hyphword) */ +void hnj_hyphen_hyphword(const char * word, int word_size, const char * hyphens, + char * hyphword, char *** rep, int ** pos, int ** cut) +{ + + if (word_size <= 0 || word_size > INT_MAX / 2) { + hyphword[0] = '\0'; + return; + } + + /* hyphword buffer size must be at least 2 * l */ + int hyphword_size = 2 * word_size - 1; + + int nonstandard = 0; + if (*rep && *pos && *cut) { + nonstandard = 1; + } + + int i; + int j = 0; + for (i = 0; i < word_size && j < hyphword_size; i++) { + hyphword[j++] = word[i]; + if (hyphens[i]&1 && j < hyphword_size) { + if (nonstandard && (*rep)[i] && j >= (*pos)[i]) { + /* non-standard */ + j -= (*pos)[i]; + char *s = (*rep)[i]; + while (*s && j < hyphword_size) { + hyphword[j++] = *s++; + } + i += (*cut)[i] - (*pos)[i]; + } else { + /* standard */ + hyphword[j++] = '='; + } + } + } + hyphword[j] = '\0'; +} + + +/* main api function with default hyphenmin parameters */ +int hnj_hyphen_hyphenate2 (HyphenDict *dict, + const char *word, int word_size, char * hyphens, + char *hyphword, char *** rep, int ** pos, int ** cut) +{ + hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, + dict->clhmin, dict->crhmin, 1, 1); + hnj_hyphen_lhmin(dict->utf8, word, word_size, + hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); + hnj_hyphen_rhmin(dict->utf8, word, word_size, + hyphens, rep, pos, cut, (dict->rhmin > 0 ? 
dict->rhmin : 2)); + + /* nohyphen */ + if (dict->nohyphen) { + char * nh = dict->nohyphen; + int nhi; + for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { + char * nhy = (char *) strstr(word, nh); + while (nhy) { + hyphens[nhy - word + strlen(nh) - 1] = '0'; + if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0'; + nhy = (char *) strstr(nhy + 1, nh); + } + nh = nh + strlen(nh) + 1; + } + } + + if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); + if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); +#ifdef VERBOSE + printf ("nums: %s\n", hyphens); +#endif + return 0; +} + +/* previous main api function with hyphenmin parameters */ +int hnj_hyphen_hyphenate3 (HyphenDict *dict, + const char *word, int word_size, char * hyphens, + char *hyphword, char *** rep, int ** pos, int ** cut, + int lhmin, int rhmin, int clhmin, int crhmin) +{ + lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin; + rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin; + clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin; + crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin; + hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, + clhmin, crhmin, 1, 1); + hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, + rep, pos, cut, (lhmin > 0 ? lhmin : 2)); + hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, + rep, pos, cut, (rhmin > 0 ? 
rhmin : 2)); + if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); + + /* nohyphen */ + if (dict->nohyphen) { + char * nh = dict->nohyphen; + int nhi; + for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { + char * nhy = (char *) strstr(word, nh); + while (nhy) { + hyphens[nhy - word + strlen(nh) - 1] = 0; + if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0; + nhy = (char *) strstr(nhy + 1, nh); + } + nh = nh + strlen(nh) + 1; + } + } + + if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); + return 0; +} diff --git a/intl/hyphenation/hyphen/hyphen.h b/intl/hyphenation/hyphen/hyphen.h new file mode 100644 index 000000000..2b4e14642 --- /dev/null +++ b/intl/hyphenation/hyphen/hyphen.h @@ -0,0 +1,175 @@ +/* Hyphen - hyphenation library using converted TeX hyphenation patterns + * + * (C) 1998 Raph Levien + * (C) 2001 ALTLinux, Moscow + * (C) 2006, 2007, 2008 László Németh + * + * This was part of libHnj library by Raph Levien. + * + * Peter Novodvorsky from ALTLinux cut hyphenation part from libHnj + * to use it in OpenOffice.org. + * + * Non-standard and compound word hyphenation support by László Németh. + * + * License is the original LibHnj license: + * + * LibHnj is dual licensed under LGPL and MPL. Boilerplate for both + * licenses follows. + */ + +/* LibHnj - a library for high quality hyphenation and justification + * Copyright (C) 1998 Raph Levien + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307 USA. +*/ + +/* + * The contents of this file are subject to the Mozilla Public License + * Version 1.0 (the "MPL"); you may not use this file except in + * compliance with the MPL. You may obtain a copy of the MPL at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the MPL is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL + * for the specific language governing rights and limitations under the + * MPL. + * + */ +#ifndef __HYPHEN_H__ +#define __HYPHEN_H__ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include <stdio.h> + +typedef struct _HyphenDict HyphenDict; +typedef struct _HyphenState HyphenState; +typedef struct _HyphenTrans HyphenTrans; +#define MAX_CHARS 100 +#define MAX_NAME 20 + +struct _HyphenDict { + /* user options */ + char lhmin; /* lefthyphenmin: min. hyph. distance from the left side */ + char rhmin; /* righthyphenmin: min. hyph. distance from the right side */ + char clhmin; /* min. hyph. distance from the left compound boundary */ + char crhmin; /* min. hyph. 
distance from the right compound boundary */ + char * nohyphen; /* comma separated list of characters or character + sequences with forbidden hyphenation */ + int nohyphenl; /* count of elements in nohyphen */ + /* system variables */ + int num_states; + char cset[MAX_NAME]; + int utf8; + HyphenState *states; + HyphenDict *nextlevel; +}; + +struct _HyphenState { + char *match; + char *repl; + signed char replindex; + signed char replcut; + int fallback_state; + int num_trans; + HyphenTrans *trans; +}; + +struct _HyphenTrans { + char ch; + int new_state; +}; + +HyphenDict *hnj_hyphen_load (const char *fn); +HyphenDict *hnj_hyphen_load_file (FILE *f); +void hnj_hyphen_free (HyphenDict *dict); + +/* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */ +int hnj_hyphen_hyphenate (HyphenDict *dict, + const char *word, int word_size, + char *hyphens); + +/* + + int hnj_hyphen_hyphenate2(): non-standard hyphenation. + + (It supports Catalan, Dutch, German, Hungarian, Norwegian, Swedish + etc. orthography, see documentation.) + + input data: + word: input word + word_size: byte length of the input word + + hyphens: allocated character buffer (size = word_size + 5) + hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL + rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers + (size=word_size) or with NULL value) or NULL + + output data: + hyphens: hyphenation vector (hyphenation points signed with odd numbers) + hyphenated_word: hyphenated input word (hyphens signed with `='), + optional (NULL input) + rep: NULL (only standard hyph.), or replacements (hyphenation points + signed with `=' in replacements); + pos: NULL, or difference of the actual position and the beginning + positions of the change in input words; + cut: NULL, or counts of the removed characters of the original words + at hyphenation, + + Note: rep, pos, cut are complementary arrays to the hyphens, indexed with the + character positions of the input word. 
+ + For example: + Schiffahrt -> Schiff=fahrt, + pattern: f1f/ff=f,1,2 + output: rep[5]="ff=f", pos[5] = 1, cut[5] = 2 + + Note: hnj_hyphen_hyphenate2() can allocate rep, pos, cut (word_size + length arrays): + + char ** rep = NULL; + int * pos = NULL; + int * cut = NULL; + char hyphens[MAXWORDLEN]; + hnj_hyphen_hyphenate2(dict, "example", 7, hyphens, NULL, &rep, &pos, &cut); + + See example in the source distribution. + +*/ + +int hnj_hyphen_hyphenate2 (HyphenDict *dict, + const char *word, int word_size, char * hyphens, + char *hyphenated_word, char *** rep, int ** pos, int ** cut); + +/* like hnj_hyphen_hyphenate2, but with hyphenmin parameters */ +/* lhmin: lefthyphenmin + * rhmin: righthyphenmin + * clhmin: compoundlefthyphemin + * crhmin: compoundrighthyphenmin + * (see documentation) */ + +int hnj_hyphen_hyphenate3 (HyphenDict *dict, + const char *word, int word_size, char * hyphens, + char *hyphword, char *** rep, int ** pos, int ** cut, + int lhmin, int rhmin, int clhmin, int crhmin); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* __HYPHEN_H__ */ diff --git a/intl/hyphenation/hyphen/moz.build b/intl/hyphenation/hyphen/moz.build new file mode 100644 index 000000000..7c5c01024 --- /dev/null +++ b/intl/hyphenation/hyphen/moz.build @@ -0,0 +1,19 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# These files cannot be built in unified mode because they include hnjalloc.h. +SOURCES += [ + 'hyphen.c', +] + +FINAL_LIBRARY = 'xul' + +LOCAL_INCLUDES += [ + '../glue', +] + +# We allow warnings for third-party code that can be updated from upstream. +ALLOW_COMPILER_WARNINGS = True |