diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /python/compare-locales/compare_locales/parser.py | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'python/compare-locales/compare_locales/parser.py')
-rw-r--r-- | python/compare-locales/compare_locales/parser.py | 521 |
1 files changed, 521 insertions, 0 deletions
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import re
import codecs
import logging
try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

try:
    unichr
except NameError:
    # Python 3 has no unichr(); chr() accepts any code point there.
    unichr = chr

# Filled in at the bottom of the module, once all parser classes exist.
__constructors = []


class Entity(object):
    '''
    Abstraction layer for a localizable entity.
    Currently supported are grammars of the form:

    1: pre white space
    2: pre comments
    3: entity definition
    4: entity key (name)
    5: entity value
    6: post comment (and white space) in the same line (dtd only)
                                             <--[1]
    <!-- pre comments -->                    <--[2]
    <!ENTITY key "value"> <!-- comment -->

    <-------[3]---------><------[6]------>

    An Entity does not copy any text. It keeps a reference to the full
    file contents plus (start, end) offset pairs into it, and slices on
    demand.
    '''
    def __init__(self, contents, pp,
                 span, pre_ws_span, pre_comment_span, def_span,
                 key_span, val_span, post_span):
        '''
        contents -- the complete text the spans index into
        pp       -- callable applied to the raw value by get_val()
        *_span   -- (start, end) offsets into contents for each part
        '''
        self.contents = contents
        self.span = span
        self.pre_ws_span = pre_ws_span
        self.pre_comment_span = pre_comment_span
        self.def_span = def_span
        self.key_span = key_span
        self.val_span = val_span
        self.post_span = post_span
        self.pp = pp

    # getter helpers

    def get_all(self):
        '''Return the full text covered by this entity.'''
        return self.contents[self.span[0]:self.span[1]]

    def get_pre_ws(self):
        '''Return the white space preceding the entity.'''
        return self.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]

    def get_pre_comment(self):
        '''Return the comments preceding the entity definition.'''
        return self.contents[self.pre_comment_span[0]:
                             self.pre_comment_span[1]]

    def get_def(self):
        '''Return the text of the entity definition itself.'''
        return self.contents[self.def_span[0]:self.def_span[1]]

    def get_key(self):
        '''Return the entity key (name).'''
        return self.contents[self.key_span[0]:self.key_span[1]]

    def get_val(self):
        '''Return the entity value after running it through self.pp.'''
        return self.pp(self.contents[self.val_span[0]:self.val_span[1]])

    def get_raw_val(self):
        '''Return the entity value exactly as it appears in the file.'''
        return self.contents[self.val_span[0]:self.val_span[1]]

    def get_post(self):
        '''Return the text following the definition (post comment).'''
        return self.contents[self.post_span[0]:self.post_span[1]]

    # getters

    all = property(get_all)
    pre_ws = property(get_pre_ws)
    pre_comment = property(get_pre_comment)
    definition = property(get_def)
    key = property(get_key)
    val = property(get_val)
    raw_val = property(get_raw_val)
    post = property(get_post)

    def __repr__(self):
        return self.key


class Junk(object):
    '''
    An almost-Entity, representing junk data that we didn't parse.
    This way, we can signal bad content as stuff we don't understand.
    And then either fix that, or report real bugs in localizations.
    '''
    # Class-wide counter, used to give every Junk instance a distinct key.
    junkid = 0

    def __init__(self, contents, span):
        '''
        contents -- the complete text the span indexes into
        span     -- (start, end) offsets of the unparsed region
        '''
        self.contents = contents
        self.span = span
        self.pre_ws = self.pre_comment = self.definition = self.post = ''
        self.__class__.junkid += 1
        self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])

    # getter helpers
    def get_all(self):
        '''Return the unparsed text.'''
        return self.contents[self.span[0]:self.span[1]]

    # getters
    all = property(get_all)
    val = property(get_all)

    def __repr__(self):
        return self.key


class Parser:
    '''
    Base class for the regular-expression driven parsers.

    Subclasses provide the compiled patterns reHeader, reKey and reFooter.
    Iterating a parser yields Entity and Junk objects for self.contents,
    which has to be set via readFile() or readContents() first.
    '''
    canMerge = True

    def __init__(self):
        # Subclasses may define their own encoding; utf-8 is the fallback.
        if not hasattr(self, 'encoding'):
            self.encoding = 'utf-8'

    def readFile(self, file):
        '''
        Load self.contents from the file at path `file`, decoded with
        self.encoding. On a decoding error the problem is logged to the
        'locales' logger and self.contents is set to the empty string.
        '''
        # The with-block closes the file even if read() raises something
        # other than UnicodeDecodeError.
        with codecs.open(file, 'r', self.encoding) as f:
            try:
                self.contents = f.read()
            except UnicodeDecodeError as e:
                (logging.getLogger('locales')
                    .error("Can't read file: " + file + '; ' + str(e)))
                self.contents = u''

    def readContents(self, contents):
        '''Set self.contents by decoding `contents` with self.encoding.'''
        (self.contents, length) = codecs.getdecoder(self.encoding)(contents)

    def parse(self):
        '''
        Parse self.contents completely.

        Returns a tuple (entities, index): the list of all yielded
        entities in file order, and a dict mapping each key to its
        position in that list.
        '''
        entities = []
        index = {}
        for e in self:
            index[e.key] = len(entities)
            entities.append(e)
        return (entities, index)

    def postProcessValue(self, val):
        '''Hook for subclasses to unescape values; identity by default.'''
        return val

    def __iter__(self):
        '''
        Yield the Entity and Junk objects found in self.contents.

        As a side effect, self.header and self.footer are set to the
        text matched by reHeader and reFooter.
        '''
        contents = self.contents
        offset = 0
        self.header, offset = self.getHeader(contents, offset)
        self.footer = ''
        entity, offset = self.getEntity(contents, offset)
        while entity:
            yield entity
            entity, offset = self.getEntity(contents, offset)
        f = self.reFooter.match(contents, offset)
        if f:
            self.footer = f.group()
            offset = f.end()
        # Whatever is left after the footer could not be parsed at all.
        if len(contents) > offset:
            yield Junk(contents, (offset, len(contents)))

    def getHeader(self, contents, offset):
        '''
        Match reHeader at the start of contents.

        Returns (header_text, new_offset); if the pattern does not
        match, the header is '' and the offset is returned unchanged.
        '''
        header = ''
        h = self.reHeader.match(contents)
        if h:
            header = h.group()
            offset = h.end()
        return (header, offset)

    def getEntity(self, contents, offset):
        '''
        Find the next entity at or after offset.

        Returns (entity, new_offset), where entity is an Entity, a Junk
        covering unparsable text before the next key, or None if there
        is nothing more to yield.
        '''
        m = self.reKey.match(contents, offset)
        if m:
            offset = m.end()
            entity = self.createEntity(contents, m)
            return (entity, offset)
        # first check if footer has a non-empty match,
        # 'cause then we don't find junk
        m = self.reFooter.match(contents, offset)
        if m and m.end() > offset:
            return (None, offset)
        m = self.reKey.search(contents, offset)
        if m:
            # we didn't match, but search, so there's junk between offset
            # and start. We'll match() on the next turn
            junkend = m.start()
            return (Junk(contents, (offset, junkend)), junkend)
        return (None, offset)

    def createEntity(self, contents, m):
        '''Build an Entity from groups 0 through 6 of the reKey match m.'''
        return Entity(contents, self.postProcessValue,
                      *[m.span(i) for i in range(7)])


def getParser(path):
    '''
    Return the shared parser instance registered for path.

    The first entry of the module's constructor table whose regular
    expression is found in path wins. Raises UserWarning if none does.
    '''
    for item in __constructors:
        if re.search(item[0], path):
            return item[1]
    raise UserWarning("Cannot find Parser")


# Subgroups of the match will:
# 1: pre white space
# 2: pre comments
# 3: entity definition
# 4: entity key (name)
# 5: entity value
# 6: post comment (and white space) in the same line (dtd only)
#                                            <--[1]
# <!-- pre comments -->                      <--[2]
# <!ENTITY key "value"> <!-- comment -->
#
# <-------[3]---------><------[6]------>


class DTDParser(Parser):
    '''Parser for DTD files consisting of <!ENTITY key "value"> lines.'''
    # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
    # ":" | [A-Z] | "_" | [a-z] |
    # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
    # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
    # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
    # [#x10000-#xEFFFF]
    CharMinusDash = u'\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
    XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
    NameStartChar = u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \
        u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
        u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
    # + \U00010000-\U000EFFFF seems to be unsupported in python

    # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
    #     [#x0300-#x036F] | [#x203F-#x2040]
    NameChar = NameStartChar + u'\\-\\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
    Name = '[' + NameStartChar + '][' + NameChar + ']*'
    reKey = re.compile('(?:(?P<pre>\\s*)(?P<precomment>(?:' + XmlComment +
                       '\\s*)*)(?P<entity><!ENTITY\\s+(?P<key>' + Name +
                       ')\\s+(?P<val>"[^"]*"|\'[^\']*\'?)\\s*>)'
                       '(?P<post>[ \t]*(?:' + XmlComment + '\\s*)*\n?)?)',
                       re.DOTALL)
    # add BOM to DTDs, details in bug 435002
    reHeader = re.compile(u'^\ufeff?'
                          u'(\\s*<!--.*(http://mozilla.org/MPL/2.0/|'
                          u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
    reFooter = re.compile('\\s*(<!--([^-]+-)*[^-]+-->\\s*)*$')
    rePE = re.compile('(?:(\\s*)((?:' + XmlComment + '\\s*)*)'
                      '(<!ENTITY\\s+%\\s+(' + Name +
                      ')\\s+SYSTEM\\s+("[^"]*"|\'[^\']*\')\\s*>\\s*%' + Name +
                      ';)([ \t]*(?:' + XmlComment + '\\s*)*\n?)?)')

    def getEntity(self, contents, offset):
        '''
        Overload Parser.getEntity to special-case ParsedEntities.
        Just check for a parsed entity if that method claims junk.

        <!ENTITY % foo SYSTEM "url">
        %foo;
        '''
        entity, inneroffset = Parser.getEntity(self, contents, offset)
        if (entity and isinstance(entity, Junk)) or entity is None:
            m = self.rePE.match(contents, offset)
            if m:
                inneroffset = m.end()
                entity = Entity(contents, self.postProcessValue,
                                *[m.span(i) for i in range(7)])
        return (entity, inneroffset)

    def createEntity(self, contents, m):
        '''Build an Entity from the named groups of the reKey match m.'''
        valspan = m.span('val')
        # Narrow the value span by one character on each side, to drop
        # the quotes around the value.
        valspan = (valspan[0]+1, valspan[1]-1)
        return Entity(contents, self.postProcessValue, m.span(),
                      m.span('pre'), m.span('precomment'),
                      m.span('entity'), m.span('key'), valspan,
                      m.span('post'))


class PropertiesParser(Parser):
    '''Parser for .properties files (key=value or key:value lines).'''
    escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
                        '(?P<nl>\n\\s*)|(?P<single>.))', re.M)
    known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}

    def __init__(self):
        self.reKey = re.compile('^(\\s*)'
                                '((?:[#!].*?\n\\s*)*)'
                                '([^#!\\s\n][^=:\n]*?)\\s*[:=][ \t]*', re.M)
        self.reHeader = re.compile('^\\s*([#!].*\\s*)+')
        self.reFooter = re.compile('\\s*([#!].*\\s*)*$')
        self._escapedEnd = re.compile(r'\\+$')
        self._trailingWS = re.compile(r'[ \t]*$')
        Parser.__init__(self)

    def getHeader(self, contents, offset):
        '''
        Return (header, new_offset) for a leading comment block.

        The block only counts as a header if it mentions the MPL 2 URL
        or 'LICENSE BLOCK'; otherwise the header is '' and the offset is
        returned unchanged.
        '''
        header = ''
        h = self.reHeader.match(contents, offset)
        if h:
            candidate = h.group()
            if 'http://mozilla.org/MPL/2.0/' in candidate or \
                    'LICENSE BLOCK' in candidate:
                header = candidate
                offset = h.end()
        return (header, offset)

    def getEntity(self, contents, offset):
        '''
        Find the next entity at or after offset.

        Returns (entity, new_offset) like Parser.getEntity, but reads
        the value line by line so that a value can continue after a
        backslash-escaped newline.
        '''
        # overwritten to parse values line by line
        m = self.reKey.match(contents, offset)
        if m:
            offset = m.end()
            while True:
                endval = nextline = contents.find('\n', offset)
                if nextline == -1:
                    endval = offset = len(contents)
                    break
                # is newline escaped?
                _e = self._escapedEnd.search(contents, offset, nextline)
                offset = nextline + 1
                if _e is None:
                    break
                # backslashes at end of line, if 2*n, not escaped
                if len(_e.group()) % 2 == 0:
                    break
            # strip trailing whitespace
            ws = self._trailingWS.search(contents, m.end(), offset)
            if ws:
                endval -= ws.end() - ws.start()
            entity = Entity(contents, self.postProcessValue,
                            (m.start(), offset),   # full span
                            m.span(1),  # leading whitespan
                            m.span(2),  # leading comment span
                            (m.start(3), offset),   # entity def span
                            m.span(3),   # key span
                            (m.end(), endval),   # value span
                            (offset, offset))  # post comment span, empty
            return (entity, offset)
        m = self.reKey.search(contents, offset)
        if m:
            # we didn't match, but search, so there's junk between offset
            # and start. We'll match() on the next turn
            junkend = m.start()
            return (Junk(contents, (offset, junkend)), junkend)
        return (None, offset)

    def postProcessValue(self, val):
        '''
        Return val with its backslash escapes resolved.

        \\uXXXX becomes the corresponding character, an escaped newline
        plus the white space after it is dropped, known single-character
        escapes (n, r, t, backslash) are translated, and any other
        escaped character stands for itself.
        '''

        def unescape(m):
            found = m.groupdict()
            if found['uni']:
                return unichr(int(found['uni'][1:], 16))
            if found['nl']:
                return ''
            return self.known_escapes.get(found['single'], found['single'])
        val = self.escape.sub(unescape, val)
        return val


class DefinesParser(Parser):
    '''Parser for files made of "#define KEY value" lines.'''
    # can't merge, #unfilter needs to be the last item, which we don't support
    canMerge = False

    def __init__(self):
        self.reKey = re.compile('^(\\s*)((?:^#(?!define\\s).*\\s*)*)'
                                '(#define[ \t]+(\\w+)[ \t]+(.*?))([ \t]*$\n?)',
                                re.M)
        self.reHeader = re.compile('^\\s*(#(?!define\\s).*\\s*)*')
        self.reFooter = re.compile('\\s*(#(?!define\\s).*\\s*)*$', re.M)
        Parser.__init__(self)


class IniParser(Parser):
    '''
    Parse files of the form:
    # initial comment
    [cat]
    whitespace*
    #comment
    string=value
    ...
    '''
    def __init__(self):
        self.reHeader = re.compile('^((?:\\s*|[;#].*)\n)*\\[.+?\\]\n', re.M)
        self.reKey = re.compile('(\\s*)((?:[;#].*\n\\s*)*)((.+?)=(.*))(\n?)')
        self.reFooter = re.compile('\\s*([;#].*\\s*)*$')
        Parser.__init__(self)


# Token type tags used by BookmarksParserInner and BookmarksParser.
DECL, COMMENT, START, END, CONTENT = range(5)


class BookmarksParserInner(HTMLParser):
    '''HTMLParser subclass that records its input as a flat token list.'''

    class Token(object):
        '''Base class of all tokens; str() gives the token's content.'''
        _type = None
        content = ''

        def __str__(self):
            return self.content

    class DeclToken(Token):
        '''A <!...> declaration.'''
        _type = DECL

        def __init__(self, decl):
            self.content = decl

        def __str__(self):
            return '<!%s>' % self.content

    class CommentToken(Token):
        '''A <!-- ... --> comment.'''
        _type = COMMENT

        def __init__(self, comment):
            self.content = comment

        def __str__(self):
            return '<!--%s-->' % self.content

    class StartToken(Token):
        '''A start tag; attrs is a dict, content the original tag text.'''
        _type = START

        def __init__(self, tag, attrs, content):
            self.tag = tag
            self.attrs = dict(attrs)
            self.content = content

    class EndToken(Token):
        '''An end tag.'''
        _type = END

        def __init__(self, tag):
            self.tag = tag

        def __str__(self):
            return '</%s>' % self.tag.upper()

    class ContentToken(Token):
        '''Text between tags.'''
        _type = CONTENT

        def __init__(self, content):
            self.content = content

    def __init__(self):
        try:
            # Python 3 resolves character references itself by default,
            # which would bypass handle_charref/handle_entityref below.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Python 2's HTMLParser takes no arguments.
            HTMLParser.__init__(self)
        self.tokens = []

    def parse(self, contents):
        '''Tokenize contents and return the resulting list of tokens.'''
        self.tokens = []
        self.feed(contents)
        self.close()
        return self.tokens

    # Called for declarations such as <!DOCTYPE ...>
    def handle_decl(self, decl):
        self.tokens.append(self.DeclToken(decl))

    # Called for comments
    def handle_comment(self, comment):
        self.tokens.append(self.CommentToken(comment))

    # Called for start tags; the original tag text is kept as content
    def handle_starttag(self, tag, attrs):
        self.tokens.append(self.StartToken(tag, attrs,
                                           self.get_starttag_text()))

    # Called when text data is encountered
    def handle_data(self, data):
        # Consecutive pieces of text are merged into a single token. The
        # emptiness check covers text that precedes every other token.
        if self.tokens and self.tokens[-1]._type == CONTENT:
            self.tokens[-1].content += data
        else:
            self.tokens.append(self.ContentToken(data))

    # Character references are kept verbatim as part of the text
    def handle_charref(self, data):
        self.handle_data('&#%s;' % data)

    # Entity references are kept verbatim as part of the text
    def handle_entityref(self, data):
        self.handle_data('&%s;' % data)

    # Called for end tags
    def handle_endtag(self, tag):
        self.tokens.append(self.EndToken(tag))


class BookmarksParser(Parser):
    '''
    Parser for bookmarks.html files.

    Keys are the dot-joined names of the currently open tags. Each
    attribute of a start tag is reported under "<key>.@<attribute>",
    and non-blank text directly following a start tag under "<key>".
    '''
    canMerge = False

    class BMEntity(object):
        '''Minimal entity, carrying just a key and a value.'''
        def __init__(self, key, val):
            self.key = key
            self.val = val

    def __iter__(self):
        '''Yield a BMEntity for each attribute and each text content.'''
        tokens = BookmarksParserInner().parse(self.contents)
        path = []
        for i, token in enumerate(tokens):
            if token._type == START:
                path.append(token.tag)
                prefix = '.'.join(path)
                for attrname in sorted(token.attrs):
                    yield self.BMEntity(prefix + '.@' + attrname,
                                        token.attrs[attrname])
                # Only text directly following the start tag is reported.
                if i + 1 < len(tokens) and tokens[i + 1]._type == CONTENT:
                    v = tokens[i + 1].content.strip()
                    if v:
                        yield self.BMEntity(prefix, v)
            elif token._type == END:
                # An end tag without any open tag is ignored instead of
                # raising IndexError.
                if path:
                    path.pop()


__constructors = [('\\.dtd$', DTDParser()),
                  ('\\.properties$', PropertiesParser()),
                  ('\\.ini$', IniParser()),
                  ('\\.inc$', DefinesParser()),
                  ('bookmarks\\.html$', BookmarksParser())]