1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import codecs
import encodings.idna
import re
import sys
"""
Processes a file containing effective TLD data. See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).
http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""
def getEffectiveTLDs(path):
    """
    Parses the effective-TLD file at |path| and returns its entries.

    The file is read as UTF-8. Lines that start with "//" and lines that
    contain no "." are skipped. For every remaining line only the text up to
    the first space, tab or newline is used to build an EffectiveTLDEntry.

    Returns a list of EffectiveTLDEntry objects sorted by domain.

    Raises AssertionError if the same domain appears more than once.
    """
    entries = []
    domains = set()
    # The context manager guarantees the file is closed even when parsing
    # fails part-way through (e.g. on a duplicate domain).
    with codecs.open(path, "r", "UTF-8") as tld_file:
        # Iterating a file never yields an empty string, so no explicit
        # end-of-file check is needed inside the loop.
        for line in tld_file:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # Keep only the rule itself; anything after whitespace is ignored.
            line = re.split(r"[ \t\n]", line, 1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            entries.append(entry)
    # Sort the entries so we can use binary search on them.
    entries.sort(key=EffectiveTLDEntry.domain)
    return entries
def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component. ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.

    Returns the normalized components joined with "." as a text string.
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        # ToASCII hands back a byte string; decode it so that every label is
        # text and the join below does not mix bytes with str.
        return encodings.idna.ToASCII(label).decode("ascii")
    return ".".join(map(convertLabel, domain.split(".")))
def _isASCII(s):
"True if s consists entirely of ASCII characters, false otherwise."
for c in s:
if ord(c) > 127:
return False
return True
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file.

    A rule is either a plain domain, an exception rule (prefixed with "!"),
    or a wildcard rule (prefixed with "*.").
    """

    # Class-level defaults; __init__ overrides them per instance only when
    # the corresponding prefix is present on the line.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds an entry from one rule line. The line must already have had
        its line ending removed.
        """
        hostname = line
        if line[:1] == "!":
            # Exception rule: drop the "!" marker.
            self._exception = True
            hostname = line[1:]
        elif line[:2] == "*.":
            # Wildcard rule: drop the "*." marker.
            self._wild = True
            hostname = line[2:]
        self._domain = _normalizeHostname(hostname)

    def domain(self):
        "The normalized domain this entry represents."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
#################
# DO EVERYTHING #
#################
def main(output, effective_tld_filename):
    """
    Parses the effective TLD file named by effective_tld_filename and writes
    one ETLD_ENTRY("domain", exception, wild) line per entry to output, with
    the two flags spelled as the C++ literals "true" / "false".
    """
    for entry in getEffectiveTLDs(effective_tld_filename):
        exception_literal = "true" if entry.exception() else "false"
        wild_literal = "true" if entry.wild() else "false"
        row = 'ETLD_ENTRY("%s", %s, %s)\n' % (
            entry.domain(), exception_literal, wild_literal)
        output.write(row)
if __name__ == '__main__':
    # Run as a script: the first command-line argument is the effective TLD
    # file to process, and the generated entries go to standard output.
    tld_file_path = sys.argv[1]
    main(sys.stdout, tld_file_path)
|