summaryrefslogtreecommitdiffstats
path: root/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict
diff options
context:
space:
mode:
Diffstat (limited to 'extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict')
-rwxr-xr-xextensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict69
1 files changed, 69 insertions, 0 deletions
diff --git a/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict b/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict
new file mode 100755
index 000000000..012b9154e
--- /dev/null
+++ b/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict
@@ -0,0 +1,69 @@
+#!/bin/sh
+
+#
+# This script creates a new dictionary by expanding the original,
+# Mozilla's, and the upstream dictionary to remove affix flags and
+# then doing the wordlist equivalent of diff3 to create a new
+# dictionary.
+#
+# The files 2-mozilla-add and 2-mozilla-rem contain words added and
+# removed, receptively in the Mozilla dictionary. The final
+# dictionary will be in hunspell-en_US-mozilla.zip.
+#
+
+set -e
+
+export LANG=C
+export LC_ALL=C
+export LC_CTYPE=C
+export LC_COLLATE=C
+
+WKDIR="`pwd`"
+
+export SCOWL="$WKDIR/scowl/"
+
+ORIG="$WKDIR/orig/"
+SPELLER="$SCOWL/speller"
+
+expand() {
+ grep -v '^[0-9]\+$' | $SPELLER/munch-list expand $1 | sort -u
+}
+
+cd $SPELLER
+MK_LIST="../mk-list -v1 --accents=both en_US 60"
+cat <<EOF > params.txt
+With Input Command: $MK_LIST
+EOF
+# note: output of make-hunspell-dict is utf-8
+$MK_LIST | ./make-hunspell-dict -one en_US-custom params.txt > ./make-hunspell-dict.log
+cd $WKDIR
+
+# Note: Input and output of "expand" is always iso-8859-1.
+# All expanded word list files are thus in iso-8859-1.
+
+expand $SPELLER/en.aff < $SPELLER/en.dic.supp > 0-special # input: ASCII
+
+# input in utf-8, expand expects iso-8859-1 so use iconv
+iconv -f utf-8 -t iso-8859-1 $ORIG/en_US-custom.dic | expand $SPELLER/en_US-custom.aff > 1-base.txt
+
+expand ../en-US.aff < ../en-US.dic > 2-mozilla.txt # input: iso-8850-1
+
+# input in utf-8, expand expects iso-8859-1 so use iconv
+iconv -f utf-8 -t iso-8859-1 $SPELLER/en_US-custom.dic | expand $SPELLER/en_US-custom.aff > 3-upstream.txt
+
+comm -23 1-base.txt 2-mozilla.txt > 2-mozilla-rem
+comm -13 1-base.txt 2-mozilla.txt > 2-mozilla-add
+comm -23 3-upstream.txt 2-mozilla-rem | cat - 2-mozilla-add | sort -u > 4-patched.txt
+
+# note: output of make-hunspell-dict is utf-8
+cat 4-patched.txt | comm -23 - 0-special | $SPELLER/make-hunspell-dict -one en_US-mozilla /dev/null
+
+# sanity check should yield identical results
+#comm -23 1-base.txt 3-upstream.txt > 3-upstream-rem
+#comm -13 1-base.txt 3-upstream.txt > 3-upstream-add
+#comm -23 2-mozilla.txt 3-upstream-rem | cat - 3-upstream-add | sort -u > 4-patched-v2.txt
+
+expand ../en-US.aff < mozilla-specific.txt > 5-mozilla-specific
+
+comm -12 3-upstream.txt 2-mozilla-rem > 5-mozilla-removed
+comm -13 3-upstream.txt 2-mozilla-add > 5-mozilla-added