Diffstat (limited to 'python/compare-locales/compare_locales/parser.py')
 python/compare-locales/compare_locales/parser.py | 521 ++++++++++++++++++++++
 1 file changed, 521 insertions(+), 0 deletions(-)
diff --git a/python/compare-locales/compare_locales/parser.py b/python/compare-locales/compare_locales/parser.py
new file mode 100644
index 000000000..a97cf201b
--- /dev/null
+++ b/python/compare-locales/compare_locales/parser.py
@@ -0,0 +1,521 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+import codecs
+import logging
+from HTMLParser import HTMLParser
+
+__constructors = []
+
+
+class Entity(object):
+ '''
+ Abstraction layer for a localizable entity.
+ Currently supported are grammars of the form:
+
+ 1: pre white space
+ 2: pre comments
+ 3: entity definition
+ 4: entity key (name)
+ 5: entity value
+ 6: post comment (and white space) in the same line (dtd only)
+ <--[1]
+ <!-- pre comments --> <--[2]
+ <!ENTITY key "value"> <!-- comment -->
+ <-------[3]---------><------[6]------>
+ '''
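+    # A minimal usage sketch (hypothetical content): fed the fragment
+    # '<!ENTITY hello "Hello">', the DTDParser below yields an Entity
+    # whose .key is 'hello', whose .raw_val is 'Hello' (the quotes are
+    # excluded via the value span), and whose spans all index into the
+    # original contents string.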
+ def __init__(self, contents, pp,
+ span, pre_ws_span, pre_comment_span, def_span,
+ key_span, val_span, post_span):
+ self.contents = contents
+ self.span = span
+ self.pre_ws_span = pre_ws_span
+ self.pre_comment_span = pre_comment_span
+ self.def_span = def_span
+ self.key_span = key_span
+ self.val_span = val_span
+ self.post_span = post_span
+ self.pp = pp
+
+ # getter helpers
+
+ def get_all(self):
+ return self.contents[self.span[0]:self.span[1]]
+
+ def get_pre_ws(self):
+ return self.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
+
+ def get_pre_comment(self):
+ return self.contents[self.pre_comment_span[0]:
+ self.pre_comment_span[1]]
+
+ def get_def(self):
+ return self.contents[self.def_span[0]:self.def_span[1]]
+
+ def get_key(self):
+ return self.contents[self.key_span[0]:self.key_span[1]]
+
+ def get_val(self):
+ return self.pp(self.contents[self.val_span[0]:self.val_span[1]])
+
+ def get_raw_val(self):
+ return self.contents[self.val_span[0]:self.val_span[1]]
+
+ def get_post(self):
+ return self.contents[self.post_span[0]:self.post_span[1]]
+
+ # getters
+
+ all = property(get_all)
+ pre_ws = property(get_pre_ws)
+ pre_comment = property(get_pre_comment)
+ definition = property(get_def)
+ key = property(get_key)
+ val = property(get_val)
+ raw_val = property(get_raw_val)
+ post = property(get_post)
+
+ def __repr__(self):
+ return self.key
+
+
+class Junk(object):
+ '''
+ An almost-Entity, representing junk data that we didn't parse.
+    This way, we can signal bad content as stuff we don't understand,
+    and then either fix that or report real bugs in localizations.
+ '''
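+    # e.g. the first junk span covering offsets 10 to 20 gets the key
+    # '_junk_1_10-20', per the format string in __init__ below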
+ junkid = 0
+
+ def __init__(self, contents, span):
+ self.contents = contents
+ self.span = span
+ self.pre_ws = self.pre_comment = self.definition = self.post = ''
+ self.__class__.junkid += 1
+ self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])
+
+ # getter helpers
+ def get_all(self):
+ return self.contents[self.span[0]:self.span[1]]
+
+ # getters
+ all = property(get_all)
+ val = property(get_all)
+
+ def __repr__(self):
+ return self.key
+
+
+class Parser:
+ canMerge = True
+
+ def __init__(self):
+ if not hasattr(self, 'encoding'):
+ self.encoding = 'utf-8'
+
+ def readFile(self, file):
+ f = codecs.open(file, 'r', self.encoding)
+ try:
+ self.contents = f.read()
+ except UnicodeDecodeError, e:
+ (logging.getLogger('locales')
+ .error("Can't read file: " + file + '; ' + str(e)))
+ self.contents = u''
+ f.close()
+
+ def readContents(self, contents):
+ (self.contents, length) = codecs.getdecoder(self.encoding)(contents)
+
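+    # Hedged usage sketch: parse() below preserves document order and
+    # adds O(1) lookup by key:
+    #   entities, keymap = parser.parse()
+    #   entities[keymap['some.key']].val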
+    def parse(self):
+        entities = []
+        keymap = {}
+        for e in self:
+            keymap[e.key] = len(entities)
+            entities.append(e)
+        return (entities, keymap)
+
+ def postProcessValue(self, val):
+ return val
+
+ def __iter__(self):
+ contents = self.contents
+ offset = 0
+ self.header, offset = self.getHeader(contents, offset)
+ self.footer = ''
+ entity, offset = self.getEntity(contents, offset)
+ while entity:
+ yield entity
+ entity, offset = self.getEntity(contents, offset)
+ f = self.reFooter.match(contents, offset)
+ if f:
+ self.footer = f.group()
+ offset = f.end()
+ if len(contents) > offset:
+ yield Junk(contents, (offset, len(contents)))
+
+ def getHeader(self, contents, offset):
+ header = ''
+        h = self.reHeader.match(contents, offset)
+ if h:
+ header = h.group()
+ offset = h.end()
+ return (header, offset)
+
+ def getEntity(self, contents, offset):
+ m = self.reKey.match(contents, offset)
+ if m:
+ offset = m.end()
+ entity = self.createEntity(contents, m)
+ return (entity, offset)
+        # check first whether the footer matches non-empty, because
+        # then the rest of the file is footer, not junk
+ m = self.reFooter.match(contents, offset)
+ if m and m.end() > offset:
+ return (None, offset)
+ m = self.reKey.search(contents, offset)
+ if m:
+            # match() failed but search() hit, so there's junk between
+            # offset and the match start; match() will succeed next call
+ junkend = m.start()
+ return (Junk(contents, (offset, junkend)), junkend)
+ return (None, offset)
+
+ def createEntity(self, contents, m):
+ return Entity(contents, self.postProcessValue,
+ *[m.span(i) for i in xrange(7)])
+
+
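+# Hedged usage sketch (hypothetical path): getParser('browser.dtd')
+# returns the shared DTDParser instance registered in __constructors at
+# the bottom of this module; paths matching no pattern raise UserWarning.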
+def getParser(path):
+ for item in __constructors:
+ if re.search(item[0], path):
+ return item[1]
+ raise UserWarning("Cannot find Parser")
+
+
+# Subgroups of the match will:
+# 1: pre white space
+# 2: pre comments
+# 3: entity definition
+# 4: entity key (name)
+# 5: entity value
+# 6: post comment (and white space) in the same line (dtd only)
+# <--[1]
+# <!-- pre comments --> <--[2]
+# <!ENTITY key "value"> <!-- comment -->
+# <-------[3]---------><------[6]------>
+
+
+class DTDParser(Parser):
+ # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
+ # ":" | [A-Z] | "_" | [a-z] |
+ # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+ # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
+ # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+ # [#x10000-#xEFFFF]
+ CharMinusDash = u'\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
+ XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
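+    # XML comments may contain single dashes but never '--'; building
+    # the comment body from a dash-free class with at most one leading
+    # dash per character enforces that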
+ NameStartChar = u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \
+ u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
+ u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
+    # the supplementary planes (\U00010000-\U000EFFFF) seem to be
+    # unsupported in narrow Python 2 builds, so they are omitted here
+
+ # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
+ # [#x0300-#x036F] | [#x203F-#x2040]
+ NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
+ Name = '[' + NameStartChar + '][' + NameChar + ']*'
+ reKey = re.compile('(?:(?P<pre>\s*)(?P<precomment>(?:' + XmlComment +
+ '\s*)*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
+ ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
+ '(?P<post>[ \t]*(?:' + XmlComment + '\s*)*\n?)?)',
+ re.DOTALL)
+    # DTDs may start with a BOM, details in bug 435002
+ reHeader = re.compile(u'^\ufeff?'
+ u'(\s*<!--.*(http://mozilla.org/MPL/2.0/|'
+ u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
+ reFooter = re.compile('\s*(<!--([^-]+-)*[^-]+-->\s*)*$')
+ rePE = re.compile('(?:(\s*)((?:' + XmlComment + '\s*)*)'
+ '(<!ENTITY\s+%\s+(' + Name +
+ ')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
+ ';)([ \t]*(?:' + XmlComment + '\s*)*\n?)?)')
+
+ def getEntity(self, contents, offset):
+ '''
+ Overload Parser.getEntity to special-case ParsedEntities.
+ Just check for a parsed entity if that method claims junk.
+
+ <!ENTITY % foo SYSTEM "url">
+ %foo;
+ '''
+ entity, inneroffset = Parser.getEntity(self, contents, offset)
+        if entity is None or isinstance(entity, Junk):
+ m = self.rePE.match(contents, offset)
+ if m:
+ inneroffset = m.end()
+ entity = Entity(contents, self.postProcessValue,
+ *[m.span(i) for i in xrange(7)])
+ return (entity, inneroffset)
+
+ def createEntity(self, contents, m):
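+        # narrow the value span by one character on each side so the
+        # quotes captured by reKey's val group are excluded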
+ valspan = m.span('val')
+ valspan = (valspan[0]+1, valspan[1]-1)
+ return Entity(contents, self.postProcessValue, m.span(),
+ m.span('pre'), m.span('precomment'),
+ m.span('entity'), m.span('key'), valspan,
+ m.span('post'))
+
+
+class PropertiesParser(Parser):
+ escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
+ '(?P<nl>\n\s*)|(?P<single>.))', re.M)
+ known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
+
+ def __init__(self):
+ self.reKey = re.compile('^(\s*)'
+ '((?:[#!].*?\n\s*)*)'
+ '([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
+ self.reHeader = re.compile('^\s*([#!].*\s*)+')
+ self.reFooter = re.compile('\s*([#!].*\s*)*$')
+ self._escapedEnd = re.compile(r'\\+$')
+ self._trailingWS = re.compile(r'[ \t]*$')
+ Parser.__init__(self)
+
+ def getHeader(self, contents, offset):
+ header = ''
+ h = self.reHeader.match(contents, offset)
+ if h:
+ candidate = h.group()
+ if 'http://mozilla.org/MPL/2.0/' in candidate or \
+ 'LICENSE BLOCK' in candidate:
+ header = candidate
+ offset = h.end()
+ return (header, offset)
+
+ def getEntity(self, contents, offset):
+ # overwritten to parse values line by line
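+        # e.g. a value line ending in a single backslash continues on
+        # the next physical line, while one ending in two backslashes
+        # (an escaped backslash) does not; see the 2*n check below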
+ m = self.reKey.match(contents, offset)
+ if m:
+ offset = m.end()
+ while True:
+ endval = nextline = contents.find('\n', offset)
+ if nextline == -1:
+ endval = offset = len(contents)
+ break
+ # is newline escaped?
+ _e = self._escapedEnd.search(contents, offset, nextline)
+ offset = nextline + 1
+ if _e is None:
+ break
+                # 2*n trailing backslashes escape each other, not the
+                # newline, so the value ends here
+ if len(_e.group()) % 2 == 0:
+ break
+ # strip trailing whitespace
+ ws = self._trailingWS.search(contents, m.end(), offset)
+ if ws:
+ endval -= ws.end() - ws.start()
+ entity = Entity(contents, self.postProcessValue,
+ (m.start(), offset), # full span
+ m.span(1), # leading whitespan
+ m.span(2), # leading comment span
+ (m.start(3), offset), # entity def span
+ m.span(3), # key span
+ (m.end(), endval), # value span
+ (offset, offset)) # post comment span, empty
+ return (entity, offset)
+ m = self.reKey.search(contents, offset)
+ if m:
+            # match() failed but search() hit, so there's junk between
+            # offset and the match start; match() will succeed next call
+ junkend = m.start()
+ return (Junk(contents, (offset, junkend)), junkend)
+ return (None, offset)
+
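+    # Hedged examples of the unescaping below: '\u0041' in a raw value
+    # becomes u'A', an escaped newline plus following indent collapses
+    # to nothing, '\n' maps through known_escapes to a newline, and an
+    # unknown escape like '\q' falls back to the bare character 'q'.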
+ def postProcessValue(self, val):
+
+ def unescape(m):
+ found = m.groupdict()
+ if found['uni']:
+ return unichr(int(found['uni'][1:], 16))
+ if found['nl']:
+ return ''
+ return self.known_escapes.get(found['single'], found['single'])
+ val = self.escape.sub(unescape, val)
+ return val
+
+
+class DefinesParser(Parser):
+    # can't merge: #unfilter needs to be the last item in the file,
+    # which merging doesn't guarantee
+ canMerge = False
+
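+    # Hypothetical example line: '#define MOZ_LANGPACK_CREATOR mozilla.org'
+    # matches reKey below; leading '#'-lines that are not '#define' are
+    # captured as pre-comments.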
+ def __init__(self):
+ self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
+ '(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
+ re.M)
+ self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
+ self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
+ Parser.__init__(self)
+
+
+class IniParser(Parser):
+ '''
+ Parse files of the form:
+ # initial comment
+ [cat]
+ whitespace*
+ #comment
+ string=value
+ ...
+ '''
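+    # Hedged note: reHeader below consumes everything up to and
+    # including the first '[section]' line, so the section name is part
+    # of the header; only the key=value pairs are yielded as entities.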
+ def __init__(self):
+ self.reHeader = re.compile('^((?:\s*|[;#].*)\n)*\[.+?\]\n', re.M)
+ self.reKey = re.compile('(\s*)((?:[;#].*\n\s*)*)((.+?)=(.*))(\n?)')
+ self.reFooter = re.compile('\s*([;#].*\s*)*$')
+ Parser.__init__(self)
+
+
+DECL, COMMENT, START, END, CONTENT = range(5)
+
+
+class BookmarksParserInner(HTMLParser):
+
+ class Token(object):
+ _type = None
+ content = ''
+
+ def __str__(self):
+ return self.content
+
+ class DeclToken(Token):
+ _type = DECL
+
+ def __init__(self, decl):
+ self.content = decl
+
+ def __str__(self):
+ return '<!%s>' % self.content
+
+ class CommentToken(Token):
+ _type = COMMENT
+
+ def __init__(self, comment):
+ self.content = comment
+
+ def __str__(self):
+ return '<!--%s-->' % self.content
+
+ class StartToken(Token):
+ _type = START
+
+ def __init__(self, tag, attrs, content):
+ self.tag = tag
+ self.attrs = dict(attrs)
+ self.content = content
+
+ class EndToken(Token):
+ _type = END
+
+ def __init__(self, tag):
+ self.tag = tag
+
+ def __str__(self):
+ return '</%s>' % self.tag.upper()
+
+ class ContentToken(Token):
+ _type = CONTENT
+
+ def __init__(self, content):
+ self.content = content
+
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.tokens = []
+
+ def parse(self, contents):
+ self.tokens = []
+ self.feed(contents)
+ self.close()
+ return self.tokens
+
+    # Called when a declaration (e.g. <!DOCTYPE ...>) is encountered
+ def handle_decl(self, decl):
+ self.tokens.append(self.DeclToken(decl))
+
+    # Called when a comment is encountered
+ def handle_comment(self, comment):
+ self.tokens.append(self.CommentToken(comment))
+
+ def handle_starttag(self, tag, attrs):
+ self.tokens.append(self.StartToken(tag, attrs,
+ self.get_starttag_text()))
+
+ # Called when text data is encountered
+ def handle_data(self, data):
+        # merge runs of text; guard against data before any token
+        if self.tokens and self.tokens[-1]._type == CONTENT:
+ self.tokens[-1].content += data
+ else:
+ self.tokens.append(self.ContentToken(data))
+
+ def handle_charref(self, data):
+ self.handle_data('&#%s;' % data)
+
+ def handle_entityref(self, data):
+ self.handle_data('&%s;' % data)
+
+    # Called when an end tag is encountered
+ def handle_endtag(self, tag):
+ self.tokens.append(self.EndToken(tag))
+
+
+class BookmarksParser(Parser):
+ canMerge = False
+
+ class BMEntity(object):
+ def __init__(self, key, val):
+ self.key = key
+ self.val = val
+
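+    # Hedged example (hypothetical input): '<DL><DT A="1">text</DT></DL>'
+    # yields BMEntity keys 'dl.dt.@a' (attribute) and 'dl.dt' (text
+    # content), since HTMLParser lowercases tag and attribute names.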
+    def __iter__(self):
+        p = BookmarksParserInner()
+        tks = p.parse(self.contents)
+        k = []
+        i = 0
+        while i < len(tks):
+            t = tks[i]
+            if t._type == START:
+                k.append(t.tag)
+                keys = t.attrs.keys()
+                keys.sort()
+                for attrname in keys:
+                    yield self.BMEntity('.'.join(k) + '.@' + attrname,
+                                        t.attrs[attrname])
+                if i + 1 < len(tks) and tks[i+1]._type == CONTENT:
+                    # the text right after a start tag belongs to it;
+                    # consume it here so the loop skips it
+                    i += 1
+                    v = tks[i].content.strip()
+                    if v:
+                        yield self.BMEntity('.'.join(k), v)
+            elif t._type == END:
+                k.pop()
+            i += 1
+
+
+__constructors = [('\\.dtd$', DTDParser()),
+ ('\\.properties$', PropertiesParser()),
+ ('\\.ini$', IniParser()),
+ ('\\.inc$', DefinesParser()),
+ ('bookmarks\\.html$', BookmarksParser())]