diff options
Diffstat (limited to 'extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict')
-rwxr-xr-x | extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict | 69 |
1 files changed, 69 insertions, 0 deletions
#!/bin/sh

#
# This script creates a new dictionary by expanding the original,
# Mozilla's, and the upstream dictionary to remove affix flags and
# then doing the wordlist equivalent of diff3 to create a new
# dictionary.
#
# The files 2-mozilla-add and 2-mozilla-rem contain words added and
# removed, respectively, in the Mozilla dictionary.  The final
# dictionary will be in hunspell-en_US-mozilla.zip.
#
# Layout expected relative to the current working directory:
#   scowl/speller/      munch-list, make-hunspell-dict, en.aff, en.dic.supp
#   scowl/mk-list       word list generator
#   orig/               en_US-custom.dic of the original (base) dictionary
#   ../en-US.aff, ../en-US.dic   the current Mozilla dictionary
#   mozilla-specific.txt         Mozilla-specific word list
#

# Abort on the first failing command.
# NOTE: under plain POSIX sh only the exit status of the LAST stage of a
# pipeline is seen by "set -e"; failures of earlier stages (e.g. iconv)
# are not detected.
set -e

# Force the C locale so that sort/comm/grep agree on a byte-wise ordering.
export LANG=C
export LC_ALL=C
export LC_CTYPE=C
export LC_COLLATE=C

WKDIR="$(pwd)"

# Exported for the SCOWL tools invoked below (presumably read by mk-list;
# the trailing slash is kept exactly as before).
export SCOWL="$WKDIR/scowl/"

ORIG="$WKDIR/orig/"
SPELLER="$SCOWL/speller"

# expand AFFIX_FILE
#   Reads a .dic-style word list on stdin, drops lines consisting only of
#   digits (the word-count header line), expands the affix flags using
#   AFFIX_FILE and writes the sorted, de-duplicated word list to stdout.
#   Input and output are always iso-8859-1.
expand() {
  # "[0-9][0-9]*" is the portable BRE spelling of the GNU-only "[0-9]\+".
  grep -v '^[0-9][0-9]*$' | "$SPELLER/munch-list" expand "$1" | sort -u
}

# Build the upstream dictionary (en_US-custom) inside the speller directory.
cd "$SPELLER"
MK_LIST="../mk-list -v1 --accents=both en_US 60"
cat <<EOF > params.txt
With Input Command: $MK_LIST
EOF
# note: output of make-hunspell-dict is utf-8
# $MK_LIST is deliberately left unquoted: it holds a command line that
# must be split into the command and its arguments.
$MK_LIST | ./make-hunspell-dict -one en_US-custom params.txt > ./make-hunspell-dict.log
cd "$WKDIR"

# Note: Input and output of "expand" is always iso-8859-1.
#       All expanded word list files are thus in iso-8859-1.

expand "$SPELLER/en.aff" < "$SPELLER/en.dic.supp" > 0-special # input: ASCII

# input in utf-8, expand expects iso-8859-1 so use iconv
iconv -f utf-8 -t iso-8859-1 "$ORIG/en_US-custom.dic" \
  | expand "$SPELLER/en_US-custom.aff" > 1-base.txt

expand ../en-US.aff < ../en-US.dic > 2-mozilla.txt # input: iso-8859-1

# input in utf-8, expand expects iso-8859-1 so use iconv
iconv -f utf-8 -t iso-8859-1 "$SPELLER/en_US-custom.dic" \
  | expand "$SPELLER/en_US-custom.aff" > 3-upstream.txt

# Words Mozilla removed from / added to the base dictionary.
comm -23 1-base.txt 2-mozilla.txt > 2-mozilla-rem
comm -13 1-base.txt 2-mozilla.txt > 2-mozilla-add

# Apply Mozilla's removals and additions on top of the upstream word list.
comm -23 3-upstream.txt 2-mozilla-rem | sort -u - 2-mozilla-add > 4-patched.txt

# note: output of make-hunspell-dict is utf-8
# The words in 0-special are stripped before the dictionary is generated.
comm -23 4-patched.txt 0-special \
  | "$SPELLER/make-hunspell-dict" -one en_US-mozilla /dev/null

# sanity check should yield identical results
#comm -23 1-base.txt 3-upstream.txt > 3-upstream-rem
#comm -13 1-base.txt 3-upstream.txt > 3-upstream-add
#comm -23 2-mozilla.txt 3-upstream-rem | cat - 3-upstream-add | sort -u > 4-patched-v2.txt

expand ../en-US.aff < mozilla-specific.txt > 5-mozilla-specific

# Mozilla removals still present upstream, and Mozilla additions that
# upstream does not contain.
comm -12 3-upstream.txt 2-mozilla-rem > 5-mozilla-removed
comm -13 3-upstream.txt 2-mozilla-add > 5-mozilla-added