summaryrefslogtreecommitdiffstats
path: root/extensions/spellcheck/locales/en-US/hunspell/dictionary-sources/make-new-dict
blob: 012b9154e257a8e87a2e95598c68bdbb1de55e2f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/bin/sh

#
# This script creates a new dictionary by expanding the original,
# Mozilla's, and the upstream dictionary to remove affix flags and
# then doing the wordlist equivalent of diff3 to create a new
# dictionary.
#
# The files 2-mozilla-add and 2-mozilla-rem contain words added and
# removed, respectively, in the Mozilla dictionary.  The final
# dictionary will be in hunspell-en_US-mozilla.zip.
#

# Stop as soon as a command fails.
set -e

# Pin every locale category used here to "C" (presumably so that sort, comm
# and grep order and compare bytes the same way on every machine).
LANG=C
LC_ALL=C
LC_CTYPE=C
LC_COLLATE=C
export LANG LC_ALL LC_CTYPE LC_COLLATE

# Directory the script was started from; the paths below are built from it.
WKDIR="$(pwd)"

# SCOWL is exported, so child processes see it as well.
SCOWL="$WKDIR/scowl/"
export SCOWL

ORIG="$WKDIR/orig/"
SPELLER="$SCOWL/speller"

# expand AFFIX_FILE
#
# Filter: reads a word list on stdin, pipes it through
# "$SPELLER/munch-list expand AFFIX_FILE", and writes the result to stdout
# sorted with duplicates removed.
#   $1      - affix (.aff) file passed to munch-list
#   SPELLER - (global) directory that contains the munch-list tool
# Lines made up only of digits (presumably the word-count header of a .dic
# file) are dropped before expansion.
expand() {
  # [0-9][0-9]* is the POSIX BRE form of "one or more digits" (\+ is a GNU
  # extension).  The paths are quoted so directories with spaces work.
  grep -v '^[0-9][0-9]*$' | "$SPELLER/munch-list" expand "$1" | sort -u
}

# Generate the en_US-custom dictionary inside the speller directory.
# Paths are quoted so a working directory containing spaces does not break cd.
cd "$SPELLER"
# Kept as a plain string on purpose: it is recorded verbatim in params.txt
# and word-split into a command line when expanded unquoted below.
MK_LIST="../mk-list -v1 --accents=both en_US 60"
cat <<EOF > params.txt
With Input Command: $MK_LIST
EOF
# note: output of make-hunspell-dict is utf-8
# shellcheck disable=SC2086 # $MK_LIST must be split into command + arguments
$MK_LIST | ./make-hunspell-dict -one en_US-custom params.txt > ./make-hunspell-dict.log
cd "$WKDIR"

# Note: Input and output of "expand" is always iso-8859-1.
#       All expanded word list files are thus in iso-8859-1.

# Produce the expanded word lists that are compared further down.
# Every path built from a variable is quoted so directories with spaces work.

# input: ASCII
expand "$SPELLER/en.aff" < "$SPELLER/en.dic.supp" > 0-special

# input in utf-8, expand expects iso-8859-1 so use iconv
iconv -f utf-8 -t iso-8859-1 "$ORIG/en_US-custom.dic" \
  | expand "$SPELLER/en_US-custom.aff" > 1-base.txt

# input: iso-8859-1
expand ../en-US.aff < ../en-US.dic > 2-mozilla.txt

# input in utf-8, expand expects iso-8859-1 so use iconv
iconv -f utf-8 -t iso-8859-1 "$SPELLER/en_US-custom.dic" \
  | expand "$SPELLER/en_US-custom.aff" > 3-upstream.txt

# Wordlist equivalent of diff3, with 1-base.txt as the common ancestor:
#   2-mozilla-rem: lines only in 1-base.txt    (words Mozilla removed)
#   2-mozilla-add: lines only in 2-mozilla.txt (words Mozilla added)
comm -23 1-base.txt 2-mozilla.txt > 2-mozilla-rem
comm -13 1-base.txt 2-mozilla.txt > 2-mozilla-add
# Replay those changes on the upstream list: drop the removed words, append
# the added ones, then re-sort and de-duplicate.
comm -23 3-upstream.txt 2-mozilla-rem | cat - 2-mozilla-add | sort -u > 4-patched.txt

# note: output of make-hunspell-dict is utf-8
# Lines also present in 0-special are filtered out before the list is fed to
# make-hunspell-dict; comm reads 4-patched.txt directly, no extra cat needed.
comm -23 4-patched.txt 0-special | "$SPELLER/make-hunspell-dict" -one en_US-mozilla /dev/null

# sanity check should yield identical results
#comm -23 1-base.txt 3-upstream.txt > 3-upstream-rem
#comm -13 1-base.txt 3-upstream.txt > 3-upstream-add
#comm -23 2-mozilla.txt 3-upstream-rem | cat - 3-upstream-add | sort -u > 4-patched-v2.txt

# Expand mozilla-specific.txt using the ../en-US.aff affix file.
expand ../en-US.aff <mozilla-specific.txt >5-mozilla-specific

# Lines present in both 3-upstream.txt and 2-mozilla-rem.
comm -12 3-upstream.txt 2-mozilla-rem >5-mozilla-removed
# Lines of 2-mozilla-add that do not occur in 3-upstream.txt.
comm -13 3-upstream.txt 2-mozilla-add >5-mozilla-added