diff options
Diffstat (limited to 'netwerk/dns/prepare_tlds.py')
-rw-r--r-- | netwerk/dns/prepare_tlds.py | 121 |
1 files changed, 121 insertions, 0 deletions
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
Processes a file containing effective TLD data.  See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).

http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""

import codecs
import encodings.idna
import re
import sys

# A rule ends at the first space, tab or newline; anything after it on the
# same line is ignored.  Compiled once because it is used for every line.
_RULE_TERMINATOR = re.compile(r"[ \t\n]")


def getEffectiveTLDs(path):
    """
    Parses the effective-TLD file at path (read as UTF-8) and returns a list
    of EffectiveTLDEntry objects sorted by their normalized domain.

    Lines starting with "//" and lines that contain no "." are skipped.
    Raises AssertionError if two rules normalize to the same domain.
    """
    entries = []
    domains = set()
    # The with-block guarantees the file is closed even if a rule is rejected.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        for line in tld_file:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            rule = _RULE_TERMINATOR.split(line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(rule)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            entries.append(entry)

    # Sort the entries so we can use binary search on them.
    entries.sort(key=EffectiveTLDEntry.domain)

    return entries


def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component.  ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.  The result is always a text string.
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        # ToASCII yields a byte string; decode it so that every label has the
        # same type and the labels can be joined.
        return encodings.idna.ToASCII(label).decode("ascii")
    return ".".join(convertLabel(label) for label in domain.split("."))


def _isASCII(s):
    "True if s consists entirely of ASCII characters, false otherwise."
    return all(ord(c) <= 127 for c in s)


class EffectiveTLDEntry:
    """
    Stores an entry in an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides them per instance when the rule
    # carries a "!" or "*." prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Creates a TLD entry from a line of data, which must have been stripped
        of the line ending.

        A leading "!" marks an exception rule and a leading "*." marks a
        wildcard rule; the prefix is removed before the remaining domain is
        normalized.
        """
        if line.startswith("!"):
            self._exception = True
            domain = line[1:]
        elif line.startswith("*."):
            self._wild = True
            domain = line[2:]
        else:
            domain = line
        self._domain = _normalizeHostname(domain)

    def domain(self):
        "The domain this represents."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry represents a class of effective TLDs."
        return self._wild


#################
# DO EVERYTHING #
#################

def main(output, effective_tld_filename):
    """
    effective_tld_filename is the effective TLD file to parse.
    A C++ array of { domain, exception, wild } entries representing the
    eTLD file is then printed to output, one ETLD_ENTRY(...) per line.
    """

    def boolStr(b):
        # Spell the flag the way C++ expects it.
        if b:
            return "true"
        return "false"

    for etld in getEffectiveTLDs(effective_tld_filename):
        exception = boolStr(etld.exception())
        wild = boolStr(etld.wild())
        output.write('ETLD_ENTRY("%s", %s, %s)\n' % (etld.domain(), exception, wild))


if __name__ == '__main__':
    main(sys.stdout, sys.argv[1])