24 files changed, 4003 insertions, 0 deletions
diff --git a/python/compare-locales/compare_locales/__init__.py b/python/compare-locales/compare_locales/__init__.py
new file mode 100644
index 000000000..bad265e4f
--- /dev/null
+++ b/python/compare-locales/compare_locales/__init__.py
@@ -0,0 +1 @@
+version = "1.1"  # version string of the compare_locales package
diff --git a/python/compare-locales/compare_locales/checks.py b/python/compare-locales/compare_locales/checks.py
new file mode 100644
index 000000000..ee3bef03d
--- /dev/null
+++ b/python/compare-locales/compare_locales/checks.py
@@ -0,0 +1,438 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+from difflib import SequenceMatcher
+from xml import sax
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+from compare_locales.parser import DTDParser, PropertiesParser
+
+
+class Checker(object):
+    '''Abstract class to implement checks per file type.
+    '''
+    pattern = None  # compiled regex on file paths, set by subclasses
+
+    @classmethod
+    def use(cls, file):  # tell whether this checker handles file.file
+        return cls.pattern.match(file.file)
+
+    def check(self, refEnt, l10nEnt):
+        '''Given the reference and localized Entities, performs checks.
+
+        This is a generator yielding (level, position, message, category):
+        - level is "warning" or "error", depending on what to report,
+        - position is line, column info (or an offset) within the string,
+        - message is shown in the report, category is a short string tag.
+        '''
+        if True:  # always raises; the yield only makes this a generator
+            raise NotImplementedError("Need to subclass")
+        yield ("error", (0, 0), "This is an example error", "example")
+
+
+class PrintfException(Exception):  # raised by getPrintfSpecs on bad values
+    def __init__(self, msg, pos):
+        self.pos = pos  # offset of the problem in the value, 0 if unknown
+        self.msg = msg  # description, reported as the error message
+
+
+class PropertiesChecker(Checker):
+    '''Tests to run on .properties files.
+    '''
+    pattern = re.compile('.*\.properties$')
+    printf = re.compile(r'%(?P<good>%|'  # '%%', or a conversion made of
+                        r'(?:(?P<number>[1-9][0-9]*)\$)?'  # position, '1$'
+                        r'(?P<width>\*|[0-9]+)?'
+                        r'(?P<prec>\.(?:\*|[0-9]+)?)?'
+                        r'(?P<spec>[duxXosScpfg]))?')  # good unset: lone '%'
+
+    def check(self, refEnt, l10nEnt):
+        '''Yield plural variable, escape and printf problems in l10nEnt.
+        '''
+        refValue, l10nValue = refEnt.val, l10nEnt.val
+        refSpecs = None
+        # plural strings (PluralForm.jsm) are recognized by the mention of
+        # Localization_and_Plurals in the reference entity's pre_comment
+        if 'Localization_and_Plurals' in refEnt.pre_comment:
+            # For plurals, the common variable pattern is #1. Try that.
+            pats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
+                                                            refValue))
+            if len(pats) == 0:
+                return
+            lpats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
+                                                             l10nValue))
+            if pats - lpats:  # reference variables not used in l10n
+                yield ('warning', 0, 'not all variables used in l10n',
+                       'plural')
+                return
+            if lpats - pats:  # l10n variables unknown to the reference
+                yield ('error', 0, 'unreplaced variables in l10n',
+                       'plural')
+                return
+            return
+        # warn about backslash escapes that aren't in known_escapes
+        raw_val = l10nEnt.raw_val
+        for m in PropertiesParser.escape.finditer(raw_val):
+            if m.group('single') and \
+               m.group('single') not in PropertiesParser.known_escapes:
+                yield ('warning', m.start(),
+                       'unknown escape sequence, \\' + m.group('single'),
+                       'escape')
+        try:
+            refSpecs = self.getPrintfSpecs(refValue)
+        except PrintfException:
+            refSpecs = []  # broken reference, skip the printf check
+        if refSpecs:
+            for t in self.checkPrintf(refSpecs, l10nValue):
+                yield t
+            return
+
+    def checkPrintf(self, refSpecs, l10nValue):  # generator, like check()
+        try:
+            l10nSpecs = self.getPrintfSpecs(l10nValue)
+        except PrintfException, e:
+            yield ('error', e.pos, e.msg, 'printf')
+            return
+        if refSpecs != l10nSpecs:
+            sm = SequenceMatcher()
+            sm.set_seqs(refSpecs, l10nSpecs)
+            msgs = []
+            warn = None
+            for action, i1, i2, j1, j2 in sm.get_opcodes():
+                if action == 'equal':
+                    continue
+                if action == 'delete':
+                    # argument of the reference is missing in l10n
+                    if i2 == len(refSpecs):
+                        # trailing specs missing, that's just a warning
+                        warn = ', '.join('trailing argument %d `%s` missing' %
+                                         (i+1, refSpecs[i])
+                                         for i in xrange(i1, i2))
+                    else:
+                        for i in xrange(i1, i2):
+                            msgs.append('argument %d `%s` missing' %
+                                        (i+1, refSpecs[i]))
+                    continue
+                if action == 'insert':
+                    # argument in l10n that the reference doesn't have
+                    for i in xrange(j1, j2):
+                        msgs.append('argument %d `%s` obsolete' %
+                                    (i+1, l10nSpecs[i]))
+                    continue
+                if action == 'replace':  # zip drops unpaired leftovers
+                    for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
+                        msgs.append('argument %d `%s` should be `%s`' %
+                                    (j+1, l10nSpecs[j], refSpecs[i]))
+            if msgs:
+                yield ('error', 0, ', '.join(msgs), 'printf')
+            if warn is not None:
+                yield ('warning', 0, warn, 'printf')
+
+    def getPrintfSpecs(self, val):  # list of conversion chars by position
+        hasNumber = False
+        specs = []
+        for m in self.printf.finditer(val):
+            if m.group("good") is None:
+                # found just a '%', signal an error
+                raise PrintfException('Found single %', m.start())
+            if m.group("good") == '%':
+                # escaped %
+                continue
+            if ((hasNumber and m.group('number') is None) or
+                    (not hasNumber and specs and
+                     m.group('number') is not None)):
+                # mixed style, numbered and not
+                raise PrintfException('Mixed ordered and non-ordered args',
+                                      m.start())
+            hasNumber = m.group('number') is not None
+            if hasNumber:
+                pos = int(m.group('number')) - 1
+                ls = len(specs)
+                if pos >= ls:
+                    # pad specs with None up to this position
+                    nones = pos - ls
+                    specs[ls:pos] = nones*[None]
+                    specs.append(m.group('spec'))
+                else:
+                    if specs[pos] is not None:
+                        raise PrintfException('Double ordered argument %d' %
+                                              (pos+1),
+                                              m.start())
+                    specs[pos] = m.group('spec')
+            else:
+                specs.append(m.group('spec'))
+        # a None left among ordered specs is a skipped argument number
+        if hasNumber and not all(specs):
+            raise PrintfException('Ordered argument missing', 0)
+        return specs
+
+
+class DTDChecker(Checker):
+    """Tests to run on DTD files.
+
+    Uses xml.sax for the heavy lifting of xml parsing.
+
+    The entities referenced by the values are collected up front, and
+    get defined as empty <!ENTITY key ""> declarations in the DOCTYPE
+    header, so that the parser doesn't fail on unresolved entities.
+
+    Also checks for some CSS and number heuristics in the values.
+    """
+    pattern = re.compile('.*\.dtd$')
+
+    eref = re.compile('&(%s);' % DTDParser.Name)
+    tmpl = '''<!DOCTYPE elem [%s]>
+<elem>%s</elem>
+'''
+    xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot'))
+
+    def __init__(self, reference):  # iterable of entities, or None
+        self.reference = reference
+        self.__known_entities = None  # filled lazily by known_entities()
+
+    def known_entities(self, refValue):  # cached if there's a reference
+        if self.__known_entities is None and self.reference is not None:
+            self.__known_entities = set()
+            for ent in self.reference:
+                self.__known_entities.update(self.entities_for_value(ent.val))
+        return self.__known_entities if self.__known_entities is not None \
+            else self.entities_for_value(refValue)
+
+    def entities_for_value(self, value):  # &name; refs, minus xmllist
+        reflist = set(m.group(1).encode('utf-8')
+                      for m in self.eref.finditer(value))
+        reflist -= self.xmllist
+        return reflist
+
+    # Setup for XML parser, with default and text-only content handler
+    class TextContent(sax.handler.ContentHandler):
+        textcontent = ''
+
+        def characters(self, content):
+            self.textcontent += content
+
+    defaulthandler = sax.handler.ContentHandler()
+    texthandler = TextContent()
+
+    numPattern = r'([0-9]+|[0-9]*\.[0-9]+)'
+    num = re.compile('^%s$' % numPattern)
+    lengthPattern = '%s(em|px|ch|cm|in)' % numPattern
+    length = re.compile('^%s$' % lengthPattern)
+    spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' %
+                      lengthPattern)
+    style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' %
+                       {'spec': spec.pattern})
+
+    processContent = None  # subclasses may define this, see end of check()
+
+    def check(self, refEnt, l10nEnt):
+        """Try to parse the refvalue inside a dummy element, and keep
+        track of entities that we need to define to make that work.
+
+        This is a generator yielding the warnings and errors found.
+        """
+        refValue, l10nValue = refEnt.val, l10nEnt.val
+        # find entities the refValue references,
+        # reusing markup from DTDParser.
+        reflist = self.known_entities(refValue)
+        inContext = self.entities_for_value(refValue)
+        entities = ''.join('<!ENTITY %s "">' % s for s in sorted(reflist))
+        parser = sax.make_parser()
+        parser.setFeature(sax.handler.feature_external_ges, False)
+
+        parser.setContentHandler(self.defaulthandler)
+        try:
+            parser.parse(StringIO(self.tmpl %
+                                  (entities, refValue.encode('utf-8'))))
+            # also parse the full definition, to catch a stray %
+            parser.parse(StringIO(self.tmpl %
+                                  (refEnt.all.encode('utf-8') + entities,
+                                   '&%s;' % refEnt.key.encode('utf-8'))))
+        except sax.SAXParseException, e:
+            yield ('warning',
+                   (0, 0),
+                   "can't parse en-US value", 'xmlparse')
+
+        # find entities the l10nValue references,
+        # reusing markup from DTDParser.
+        l10nlist = self.entities_for_value(l10nValue)
+        missing = sorted(l10nlist - reflist)
+        _entities = entities + ''.join('<!ENTITY %s "">' % s for s in missing)
+        if self.processContent is not None:
+            self.texthandler.textcontent = ''
+            parser.setContentHandler(self.texthandler)
+        try:
+            parser.parse(StringIO(self.tmpl % (_entities,
+                         l10nValue.encode('utf-8'))))
+            # also parse the full definition, to catch a stray %
+            # if this fails, we need to subtract the entity definition
+            parser.setContentHandler(self.defaulthandler)
+            parser.parse(StringIO(self.tmpl % (
+                l10nEnt.all.encode('utf-8') + _entities,
+                '&%s;' % l10nEnt.key.encode('utf-8'))))
+        except sax.SAXParseException, e:
+            # xml parse error, yield error
+            # sometimes, the error is reported on our fake closing
+            # element, make that the end of the last line
+            lnr = e.getLineNumber() - 1
+            lines = l10nValue.splitlines()
+            if lnr > len(lines):
+                lnr = len(lines)
+                col = len(lines[lnr-1])
+            else:
+                col = e.getColumnNumber()
+                if lnr == 1:
+                    # first line starts with <elem>, subtract
+                    col -= len("<elem>")
+                elif lnr == 0:
+                    col -= len("<!DOCTYPE elem [")  # first line is DOCTYPE
+            yield ('error', (lnr, col), ' '.join(e.args), 'xmlparse')
+
+        warntmpl = u'Referencing unknown entity `%s`'
+        if reflist:
+            if inContext:
+                elsewhere = reflist - inContext
+                warntmpl += ' (%s used in context' % \
+                    ', '.join(sorted(inContext))
+                if elsewhere:
+                    warntmpl += ', %s known)' % ', '.join(sorted(elsewhere))
+                else:
+                    warntmpl += ')'
+            else:
+                warntmpl += ' (%s known)' % ', '.join(sorted(reflist))
+        for key in missing:
+            yield ('warning', (0, 0), warntmpl % key.decode('utf-8'),
+                   'xmlparse')
+        if inContext and l10nlist and l10nlist - inContext - set(missing):
+            mismatch = sorted(l10nlist - inContext - set(missing))
+            for key in mismatch:
+                yield ('warning', (0, 0),
+                       'Entity %s referenced, but %s used in context' % (
+                           key.decode('utf-8'),
+                           ', '.join(sorted(inContext))
+                ), 'xmlparse')
+
+        # Number check
+        if self.num.match(refValue) and not self.num.match(l10nValue):
+            yield ('warning', 0, 'reference is a number', 'number')
+        # CSS checks
+        # just a length, width="100em"
+        if self.length.match(refValue) and not self.length.match(l10nValue):
+            yield ('error', 0, 'reference is a CSS length', 'css')
+        # real CSS spec, style="width:100px;"
+        if self.style.match(refValue):
+            if not self.style.match(l10nValue):
+                yield ('error', 0, 'reference is a CSS spec', 'css')
+            else:
+                # warn if different properties or units
+                refMap = dict((s, u) for s, _, u in
+                              self.spec.findall(refValue))
+                msgs = []
+                for s, _, u in self.spec.findall(l10nValue):
+                    if s not in refMap:
+                        msgs.insert(0, '%s only in l10n' % s)
+                        continue
+                    else:
+                        ru = refMap.pop(s)
+                        if u != ru:
+                            msgs.append("units for %s don't match "
+                                        "(%s != %s)" % (s, u, ru))
+                for s in refMap.iterkeys():  # not popped: missing in l10n
+                    msgs.insert(0, '%s only in reference' % s)
+                if msgs:
+                    yield ('warning', 0, ', '.join(msgs), 'css')
+
+        if self.processContent is not None:
+            for t in self.processContent(self.texthandler.textcontent):
+                yield t
+
+
+class PrincessAndroid(DTDChecker):
+    """Checker for the string values that Android puts into an XML container.
+
+    http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling  # noqa
+    has more info. Check for unescaped apostrophes and bad unicode escapes.
+    """
+    quoted = re.compile("(?P<q>[\"']).*(?P=q)$")  # "..." or '...'
+
+    def unicode_escape(self, str):
+        """Helper method to try to decode all unicode escapes in a string.
+
+        This code uses the standard python decode for unicode-escape, but
+        that's somewhat tricky, as its input needs to be ascii. To get to
+        ascii, the unicode string gets converted to ascii with
+        backslashreplace, i.e., all non-ascii unicode chars get unicode
+        escaped. And then we try to roll all of that back.
+        Now, when that hits an error, that's from the original string, and we
+        need to search for the actual error position in the original string,
+        as the backslashreplace code changes string positions quite badly.
+        See also the last check in TestAndroid.test_android_dtd, with a
+        lengthy chinese string.
+        """
+        val = str.encode('ascii', 'backslashreplace')
+        try:
+            val.decode('unicode-escape')
+        except UnicodeDecodeError, e:
+            args = list(e.args)  # encoding, object, start, end, reason
+            badstring = args[1][args[2]:args[3]]
+            i = len(args[1][:args[2]].decode('unicode-escape'))
+            args[2] = i  # start, counted in the original string
+            args[3] = i + len(badstring)
+            raise UnicodeDecodeError(*args)
+
+    @classmethod
+    def use(cls, file):
+        """Use this Checker only for DTD files in the Android modules."""
+        return (file.module in ("embedding/android",
+                                "mobile/android/base") and
+                cls.pattern.match(file.file))
+
+    def processContent(self, val):
+        """Actual check code.
+        Check for unicode escapes and unescaped quotes and apostrophes,
+        if string's not quoted.
+        """
+        # first, try to decode unicode escapes
+        try:
+            self.unicode_escape(val)
+        except UnicodeDecodeError, e:
+            yield ('error', e.args[2], e.args[4], 'android')
+        # check for unescaped single or double quotes.
+        # first, see if the complete string is single or double quoted,
+        # that changes the rules
+        m = self.quoted.match(val)
+        if m:
+            q = m.group('q')
+            offset = 0  # the stripped quote makes up for end() being past
+            val = val[1:-1]  # strip quotes
+        else:
+            q = "[\"']"
+            offset = -1  # end() is one past the quote character
+        stray_quot = re.compile(r"[\\\\]*(%s)" % q)
+
+        for m in stray_quot.finditer(val):
+            if len(m.group(0)) % 2:  # even number of backslashes before it
+                # found an unescaped single or double quote, which message?
+                if m.group(1) == '"':
+                    msg = u"Quotes in Android DTDs need escaping with \\\" "\
+                          u"or \\u0022, or put string in apostrophes."
+                else:
+                    msg = u"Apostrophes in Android DTDs need escaping with "\
+                          u"\\' or \\u0027, or use \u2019, or put string in "\
+                          u"quotes."
+                yield ('error', m.end(0)+offset, msg, 'android')
+
+
+def getChecker(file, reference=None):  # first matching checker, or None
+    if PropertiesChecker.use(file):
+        return PropertiesChecker()
+    if PrincessAndroid.use(file):  # more specific than DTDChecker, go first
+        return PrincessAndroid(reference)
+    if DTDChecker.use(file):
+        return DTDChecker(reference)
+    return None
diff --git a/python/compare-locales/compare_locales/commands.py b/python/compare-locales/compare_locales/commands.py
new file mode 100644
index 000000000..61b58ec4b
--- /dev/null
+++ b/python/compare-locales/compare_locales/commands.py
@@ -0,0 +1,154 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'Commands exposed to commandlines'
+
+import logging
+from optparse import OptionParser, make_option
+
+from compare_locales.paths import EnumerateApp
+from compare_locales.compare import compareApp, compareDirs
+from compare_locales.webapps import compare_web_app
+
+
+class BaseCommand(object):
+    """Base class for compare-locales commands.
+    This handles command line parsing, and general sugar for setuptools
+    entry_points.
+    """
+    options = [  # common options, the subclasses build their lists on this
+        make_option('-v', '--verbose', action='count', dest='v', default=0,
+                    help='Make more noise'),
+        make_option('-q', '--quiet', action='count', dest='q', default=0,
+                    help='Make less noise'),
+        make_option('-m', '--merge',
+                    help='''Use this directory to stage merged files,
+use {ab_CD} to specify a different directory for each locale'''),
+    ]
+    data_option = make_option('--data', choices=['text', 'exhibit', 'json'],
+                              default='text',
+                              help='''Choose data and format (one of text,
+exhibit, json); text: (default) Show which files miss which strings, together
+with warnings and errors. Also prints a summary; json: Serialize the internal
+tree, useful for tools. Also always succeeds; exhibit: Serialize the summary
+data in a json useful for Exhibit
+''')  # not in `options` above, each subclass appends it on its own
+
+    def __init__(self):
+        self.parser = None  # the OptionParser, created in handle_()
+
+    def get_parser(self):
+        """Get an OptionParser, with class docstring as usage, and
+        self.options.
+        """
+        parser = OptionParser()
+        parser.set_usage(self.__doc__)
+        for option in self.options:
+            parser.add_option(option)
+        return parser
+
+    @classmethod
+    def call(cls):
+        """Entry_point for setuptools.
+        The actual command handling is done in the handle() method of the
+        subclasses.
+        """
+        cmd = cls()
+        cmd.handle_()
+
+    def handle_(self):
+        """The instance part of the classmethod call."""
+        self.parser = self.get_parser()
+        (options, args) = self.parser.parse_args()
+        # log as verbose or quiet as we want, warn by default
+        logging.basicConfig()
+        logging.getLogger().setLevel(logging.WARNING -
+                                     (options.v - options.q)*10)
+        observer = self.handle(args, options)  # subclasses do the work
+        print observer.serialize(type=options.data).encode('utf-8', 'replace')
+
+    def handle(self, args, options):
+        """Subclasses need to implement this method for the actual
+        command handling.
+        """
+        raise NotImplementedError
+
+
+class CompareLocales(BaseCommand):
+    """usage: %prog [options] l10n.ini l10n_base_dir [locale ...]
+
+Check the localization status of a gecko application.
+The first argument is a path to the l10n.ini file for the application,
+followed by the base directory of the localization repositories.
+Then you pass in the list of locale codes you want to compare. If there are
+no locales given, the list of locales will be taken from the all-locales file
+of the application\'s l10n.ini."""
+
+    options = BaseCommand.options + [
+        make_option('--clobber-merge', action="store_true", default=False,
+                    dest='clobber',
+                    help="""WARNING: DATALOSS.
+Use this option with care. If specified, the merge directory will
+be clobbered for each module. That means, the subdirectory will
+be completely removed, any files that were there are lost.
+Be careful to specify the right merge directory when using this option."""),
+        make_option('-r', '--reference', default='en-US', dest='reference',
+                    help='Explicitly set the reference '
+                    'localization. [default: en-US]'),
+        BaseCommand.data_option
+    ]
+
+    def handle(self, args, options):  # returns the observer of compareApp
+        if len(args) < 2:  # locales are optional, the two paths are not
+            self.parser.error('Need to pass in l10n.ini and l10n_base_dir')
+        inipath, l10nbase = args[:2]
+        locales = args[2:]
+        app = EnumerateApp(inipath, l10nbase, locales)
+        app.reference = options.reference
+        try:
+            observer = compareApp(app, merge_stage=options.merge,
+                                  clobber=options.clobber)
+        except (OSError, IOError), exc:
+            print "FAIL: " + str(exc)
+            self.parser.exit(2)  # raises SystemExit, so no return below
+        return observer
+
+
+class CompareDirs(BaseCommand):
+    """usage: %prog [options] reference localization
+
+Check the localization status of a directory tree.
+The first argument is a path to the reference data, the second is the
+localization to be tested."""
+
+    options = BaseCommand.options + [
+        BaseCommand.data_option
+    ]
+
+    def handle(self, args, options):  # returns the observer of compareDirs
+        if len(args) != 2:  # exactly the two directories, nothing else
+            self.parser.error('Reference and localization required')
+        reference, locale = args
+        observer = compareDirs(reference, locale, merge_stage=options.merge)
+        return observer
+
+
+class CompareWebApp(BaseCommand):
+    """usage: %prog [options] webapp [locale locale]
+
+Check the localization status of a gaia-style web app.
+The first argument is the directory of the web app.
+Following arguments explicitly state the locales to test.
+If none are given, test all locales in manifest.webapp or files."""
+
+    options = BaseCommand.options[:-1] + [  # all but the last one, --merge
+        BaseCommand.data_option]
+
+    def handle(self, args, options):  # returns what compare_web_app returns
+        if len(args) < 1:
+            self.parser.error('Webapp directory required')
+        basedir = args[0]
+        locales = args[1:]  # may be empty, see the usage text
+        observer = compare_web_app(basedir, locales)
+        return observer
diff --git a/python/compare-locales/compare_locales/compare.py b/python/compare-locales/compare_locales/compare.py
new file mode 100644
index 000000000..4f71c46f8
--- /dev/null
+++ b/python/compare-locales/compare_locales/compare.py
@@ -0,0 +1,638 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'Mozilla l10n compare locales tool'
+
+import codecs
+import os
+import os.path
+import shutil
+import re
+from difflib import SequenceMatcher
+from collections import defaultdict
+
+try:
+    from json import dumps
+except:
+    from simplejson import dumps
+
+from compare_locales import parser
+from compare_locales import paths
+from compare_locales.checks import getChecker
+
+
+class Tree(object):  # tree with tuples of path segments as branch keys
+    def __init__(self, valuetype):
+        self.branches = dict()  # tuple of segments -> child Tree
+        self.valuetype = valuetype  # called without arguments for new values
+        self.value = None  # created on first access in __get
+
+    def __getitem__(self, leaf):  # leaf: paths.File or '/'-joined string
+        parts = []
+        if isinstance(leaf, paths.File):
+            parts = [p for p in [leaf.locale, leaf.module] if p] + \
+                leaf.file.split('/')
+        else:
+            parts = leaf.split('/')
+        return self.__get(parts)
+
+    def __get(self, parts):  # value for parts, adding branches as needed
+        common = None
+        old = None
+        new = tuple(parts)
+        t = self
+        for k, v in self.branches.iteritems():
+            for i, part in enumerate(zip(k, parts)):
+                if part[0] != part[1]:
+                    i -= 1  # index of the last matching segment
+                    break
+            if i < 0:  # first segment differs already, try the next key
+                continue
+            i += 1  # number of leading segments k and parts share
+            common = tuple(k[:i])
+            old = tuple(k[i:])
+            new = tuple(parts[i:])
+            break
+        if old:  # k only partly matches, split that branch at common
+            self.branches.pop(k)
+            t = Tree(self.valuetype)
+            t.branches[old] = v
+            self.branches[common] = t
+        elif common:  # k is a prefix of parts (or equal), use that branch
+            t = self.branches[common]
+        if new:
+            if common:
+                return t.__get(new)
+            t2 = t  # nothing shared with any key, add a new branch
+            t = Tree(self.valuetype)
+            t2.branches[new] = t
+        if t.value is None:
+            t.value = t.valuetype()
+        return t.value
+
+    indent = '  '
+
+    def getContent(self, depth=0):
+        '''
+        Returns iterator of (depth, flag, key_or_value) tuples.
+        If flag is 'value', key_or_value is a value object, otherwise
+        (flag is 'key') it's a key tuple of path segments.
+        '''
+        keys = self.branches.keys()
+        keys.sort()
+        if self.value is not None:
+            yield (depth, 'value', self.value)
+        for key in keys:
+            yield (depth, 'key', key)
+            for child in self.branches[key].getContent(depth + 1):
+                yield child
+
+    def toJSON(self):
+        '''
+        Returns this Tree as a JSON-able tree of hashes.
+        Only the values need to take care that they're JSON-able.
+        '''
+        json = {}
+        keys = self.branches.keys()
+        keys.sort()
+        if self.value is not None:
+            json['value'] = self.value
+        children = [('/'.join(key), self.branches[key].toJSON())
+                    for key in keys]
+        if children:
+            json['children'] = children
+        return json
+
+    def getStrRows(self):  # one indented string per getContent() item
+        def tostr(t):
+            if t[1] == 'key':
+                return self.indent * t[0] + '/'.join(t[2])
+            return self.indent * (t[0] + 1) + str(t[2])
+
+        return map(tostr, self.getContent())
+
+    def __str__(self):
+        return '\n'.join(self.getStrRows())
+
+
+class AddRemove(SequenceMatcher):  # iterates as (action, item) tuples
+    def __init__(self):
+        SequenceMatcher.__init__(self, None, None, None)
+
+    def set_left(self, left):
+        if not isinstance(left, list):
+            left = [l for l in left]  # turn other iterables into a list
+        self.set_seq1(left)
+
+    def set_right(self, right):
+        if not isinstance(right, list):
+            right = [l for l in right]  # turn other iterables into a list
+        self.set_seq2(right)
+
+    def __iter__(self):
+        for tag, i1, i2, j1, j2 in self.get_opcodes():
+            if tag == 'equal':
+                for pair in zip(self.a[i1:i2], self.b[j1:j2]):
+                    yield ('equal', pair)  # pair is (left item, right item)
+            elif tag == 'delete':
+                for item in self.a[i1:i2]:
+                    yield ('delete', item)
+            elif tag == 'insert':
+                for item in self.b[j1:j2]:
+                    yield ('add', item)
+            else:
+                # tag == 'replace', report as deletes followed by adds
+                for item in self.a[i1:i2]:
+                    yield ('delete', item)
+                for item in self.b[j1:j2]:
+                    yield ('add', item)
+
+
+class DirectoryCompare(SequenceMatcher):
+    def __init__(self, reference):  # reference is kept as sequence a
+        SequenceMatcher.__init__(self, None, [i for i in reference],
+                                 [])
+        self.watcher = None
+
+    def setWatcher(self, watcher):  # needs compare(), add() and remove()
+        self.watcher = watcher
+
+    def compareWith(self, other):  # report the differences to the watcher
+        if not self.watcher:
+            return  # nobody to report to
+        self.set_seq2([i for i in other])
+        for tag, i1, i2, j1, j2 in self.get_opcodes():
+            if tag == 'equal':
+                for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
+                    self.watcher.compare(self.a[i], self.b[j])
+            elif tag == 'delete':  # only in the reference
+                for i in xrange(i1, i2):
+                    self.watcher.add(self.a[i], other.cloneFile(self.a[i]))
+            elif tag == 'insert':  # only in other
+                for j in xrange(j1, j2):
+                    self.watcher.remove(self.b[j])
+            else:  # 'replace', handled as removes followed by adds
+                for j in xrange(j1, j2):
+                    self.watcher.remove(self.b[j])
+                for i in xrange(i1, i2):
+                    self.watcher.add(self.a[i], other.cloneFile(self.a[i]))
+
+
+class Observer(object):
+    stat_cats = ['missing', 'obsolete', 'missingInFiles', 'report',
+                 'changed', 'unchanged', 'keys']  # summed up in notify()
+
+    def __init__(self):
+        class intdict(defaultdict):  # defaultdict(int), usable as factory
+            def __init__(self):
+                defaultdict.__init__(self, int)
+
+        self.summary = defaultdict(intdict)  # locale -> category -> count
+        self.details = Tree(dict)  # one dict of findings per file
+        self.filter = None  # optional callable, consulted in notify()
+
+    # support pickling
+    def __getstate__(self):  # summary as plain dicts; filter is left out
+        return dict(summary=self.getSummary(), details=self.details)
+
+    def __setstate__(self, state):  # counterpart of __getstate__
+        class intdict(defaultdict):  # same helper as in __init__
+            def __init__(self):
+                defaultdict.__init__(self, int)
+
+        self.summary = defaultdict(intdict)
+        if 'summary' in state:  # the summary is optional in the state
+            for loc, stats in state['summary'].iteritems():
+                self.summary[loc].update(stats)
+        self.details = state['details']
+        self.filter = None  # not part of the state, starts out unset
+
+    def getSummary(self):  # copy of self.summary made of plain dicts
+        plaindict = {}
+        for k, v in self.summary.iteritems():
+            plaindict[k] = dict(v)
+        return plaindict
+
+    def toJSON(self):
+        return dict(summary=self.getSummary(), details=self.details.toJSON())
+
+    def notify(self, category, file, data):
+        rv = "error"
+        if category in self.stat_cats:
+            # these get called post reporting just for stats
+            # return "error" to forward them to other other_observers
+            self.summary[file.locale][category] += data
+            # keep track of how many strings are in a missing file
+            # we got the {'missingFile': 'error'} from the first pass
+            if category == 'missingInFiles':
+                self.details[file]['strings'] = data
+            return "error"
+        if category in ['missingFile', 'obsoleteFile']:
+            if self.filter is not None:
+                rv = self.filter(file)
+            if rv != "ignore":
+                self.details[file][category] = rv
+            return rv
+        if category in ['missingEntity', 'obsoleteEntity']:
+            if self.filter is not None:
+                rv = self.filter(file, data)
+            if rv == "ignore":
+                return rv
+            v = self.details[file]
+            try:
+                v[category].append(data)
+            except KeyError:
+                v[category] = [data]
+            return rv
+        if category == 'error':
+            try:
+                self.details[file][category].append(data)
+            except KeyError:
+                self.details[file][category] = [data]
+            self.summary[file.locale]['errors'] += 1
+        elif category == 'warning':
+            try:
+                self.details[file][category].append(data)
+            except KeyError:
+                self.details[file][category] = [data]
+            self.summary[file.locale]['warnings'] += 1
+        return rv
+
+    def toExhibit(self):
+        items = []
+        for locale in sorted(self.summary.iterkeys()):
+            summary = self.summary[locale]
+            if locale is not None:
+                item = {'id': 'xxx/' + locale,
+                        'label': locale,
+                        'locale': locale}
+            else:
+                item = {'id': 'xxx',
+                        'label': 'xxx',
+                        'locale': 'xxx'}
+            item['type'] = 'Build'
+            total = sum([summary[k]
+                         for k in ('changed', 'unchanged', 'report', 'missing',
+                                   'missingInFiles')
+                         if k in summary])
+            rate = (('changed' in summary and summary['changed'] * 100) or
+                    0) / total
+            item.update((k, summary.get(k, 0))
+                        for k in ('changed', 'unchanged'))
+            item.update((k, summary[k])
+                        for k in ('report', 'errors', 'warnings')
+                        if k in summary)
+            item['missing'] = summary.get('missing', 0) + \
+                summary.get('missingInFiles', 0)
+            item['completion'] = rate
+            item['total'] = total
+            result = 'success'
+            if item.get('warnings', 0):
+                result = 'warning'
+            if item.get('errors', 0) or item.get('missing', 0):
+                result = 'failure'
+            item['result'] = result
+            items.append(item)
+        data = {
+            "properties": dict.fromkeys(
+                ("completion", "errors", "warnings", "missing", "report",
+                 "unchanged", "changed", "obsolete"),
+                {"valueType": "number"}),
+            "types": {
+                "Build": {"pluralLabel": "Builds"}
+            }}
+        data['items'] = items
+        return dumps(data, indent=2)
+
+    def serialize(self, type="text"):
+        if type == "exhibit":
+            return self.toExhibit()
+        if type == "json":
+            return dumps(self.toJSON())
+
+        def tostr(t):
+            if t[1] == 'key':
+                return '  ' * t[0] + '/'.join(t[2])
+            o = []
+            indent = '  ' * (t[0] + 1)
+            if 'error' in t[2]:
+                o += [indent + 'ERROR: ' + e for e in t[2]['error']]
+            if 'warning' in t[2]:
+                o += [indent + 'WARNING: ' + e for e in t[2]['warning']]
+            if 'missingEntity' in t[2] or 'obsoleteEntity' in t[2]:
+                missingEntities = ('missingEntity' in t[2] and
+                                   t[2]['missingEntity']) or []
+                obsoleteEntities = ('obsoleteEntity' in t[2] and
+                                    t[2]['obsoleteEntity']) or []
+                entities = missingEntities + obsoleteEntities
+                entities.sort()
+                for entity in entities:
+                    op = '+'
+                    if entity in obsoleteEntities:
+                        op = '-'
+                    o.append(indent + op + entity)
+            elif 'missingFile' in t[2]:
+                o.append(indent + '// add and localize this file')
+            elif 'obsoleteFile' in t[2]:
+                o.append(indent + '// remove this file')
+            return '\n'.join(o)
+
+        out = []
+        for locale, summary in sorted(self.summary.iteritems()):
+            if locale is not None:
+                out.append(locale + ':')
+            out += [k + ': ' + str(v) for k, v in sorted(summary.iteritems())]
+            total = sum([summary[k]
+                         for k in ['changed', 'unchanged', 'report', 'missing',
+                                   'missingInFiles']
+                         if k in summary])
+            rate = 0
+            if total:
+                rate = (('changed' in summary and summary['changed'] * 100) or
+                        0) / total
+            out.append('%d%% of entries changed' % rate)
+        return '\n'.join(map(tostr, self.details.getContent()) + out)
+
+    def __str__(self):
+        return 'observer'
+
+
+class ContentComparer:
+    keyRE = re.compile('[kK]ey')
+    nl = re.compile('\n', re.M)
+
+    def __init__(self):
+        '''Create a ContentComparer.
+        observer is usually a instance of Observer. The return values
+        of the notify method are used to control the handling of missing
+        entities.
+        '''
+        self.reference = dict()
+        self.observer = Observer()
+        self.other_observers = []
+        self.merge_stage = None
+
+    def add_observer(self, obs):
+        '''Add a non-filtering observer.
+        Results from the notify calls are ignored.
+        '''
+        self.other_observers.append(obs)
+
+    def set_merge_stage(self, merge_stage):
+        self.merge_stage = merge_stage
+
+    def merge(self, ref_entities, ref_map, ref_file, l10n_file, missing,
+              skips, p):
+        outfile = os.path.join(self.merge_stage, l10n_file.module,
+                               l10n_file.file)
+        outdir = os.path.dirname(outfile)
+        if not os.path.isdir(outdir):
+            os.makedirs(outdir)
+        if not p.canMerge:
+            shutil.copyfile(ref_file.fullpath, outfile)
+            print "copied reference to " + outfile
+            return
+        if skips:
+            # skips come in ordered by key name, we need them in file order
+            skips.sort(key=lambda s: s.span[0])
+        trailing = (['\n'] +
+                    [ref_entities[ref_map[key]].all for key in missing] +
+                    [ref_entities[ref_map[skip.key]].all for skip in skips
+                     if not isinstance(skip, parser.Junk)])
+        if skips:
+            # we need to skip a few errornous blocks in the input, copy by hand
+            f = codecs.open(outfile, 'wb', p.encoding)
+            offset = 0
+            for skip in skips:
+                chunk = skip.span
+                f.write(p.contents[offset:chunk[0]])
+                offset = chunk[1]
+            f.write(p.contents[offset:])
+        else:
+            shutil.copyfile(l10n_file.fullpath, outfile)
+            f = codecs.open(outfile, 'ab', p.encoding)
+        print "adding to " + outfile
+
+        def ensureNewline(s):
+            if not s.endswith('\n'):
+                return s + '\n'
+            return s
+
+        f.write(''.join(map(ensureNewline, trailing)))
+        f.close()
+
+    def notify(self, category, file, data):
+        """Check observer for the found data, and if it's
+        not to ignore, notify other_observers.
+        """
+        rv = self.observer.notify(category, file, data)
+        if rv == 'ignore':
+            return rv
+        for obs in self.other_observers:
+            # non-filtering other_observers, ignore results
+            obs.notify(category, file, data)
+        return rv
+
+    def remove(self, obsolete):
+        self.notify('obsoleteFile', obsolete, None)
+        pass
+
+    def compare(self, ref_file, l10n):
+        try:
+            p = parser.getParser(ref_file.file)
+        except UserWarning:
+            # no comparison, XXX report?
+            return
+        if ref_file not in self.reference:
+            # we didn't parse this before
+            try:
+                p.readContents(ref_file.getContents())
+            except Exception, e:
+                self.notify('error', ref_file, str(e))
+                return
+            self.reference[ref_file] = p.parse()
+        ref = self.reference[ref_file]
+        ref_list = ref[1].keys()
+        ref_list.sort()
+        try:
+            p.readContents(l10n.getContents())
+            l10n_entities, l10n_map = p.parse()
+        except Exception, e:
+            self.notify('error', l10n, str(e))
+            return
+        lines = []
+
+        def _getLine(offset):
+            if not lines:
+                lines.append(0)
+                for m in self.nl.finditer(p.contents):
+                    lines.append(m.end())
+            for i in xrange(len(lines), 0, -1):
+                if offset >= lines[i - 1]:
+                    return (i, offset - lines[i - 1])
+            return (1, offset)
+
+        l10n_list = l10n_map.keys()
+        l10n_list.sort()
+        ar = AddRemove()
+        ar.set_left(ref_list)
+        ar.set_right(l10n_list)
+        report = missing = obsolete = changed = unchanged = keys = 0
+        missings = []
+        skips = []
+        checker = getChecker(l10n, reference=ref[0])
+        for action, item_or_pair in ar:
+            if action == 'delete':
+                # missing entity
+                _rv = self.notify('missingEntity', l10n, item_or_pair)
+                if _rv == "ignore":
+                    continue
+                if _rv == "error":
+                    # only add to missing entities for l10n-merge on error,
+                    # not report
+                    missings.append(item_or_pair)
+                    missing += 1
+                else:
+                    # just report
+                    report += 1
+            elif action == 'add':
+                # obsolete entity or junk
+                if isinstance(l10n_entities[l10n_map[item_or_pair]],
+                              parser.Junk):
+                    junk = l10n_entities[l10n_map[item_or_pair]]
+                    params = (junk.val,) + junk.span
+                    self.notify('error', l10n,
+                                'Unparsed content "%s" at %d-%d' % params)
+                    if self.merge_stage is not None:
+                        skips.append(junk)
+                elif self.notify('obsoleteEntity', l10n,
+                                 item_or_pair) != 'ignore':
+                    obsolete += 1
+            else:
+                # entity found in both ref and l10n, check for changed
+                entity = item_or_pair[0]
+                refent = ref[0][ref[1][entity]]
+                l10nent = l10n_entities[l10n_map[entity]]
+                if self.keyRE.search(entity):
+                    keys += 1
+                else:
+                    if refent.val == l10nent.val:
+                        self.doUnchanged(l10nent)
+                        unchanged += 1
+                    else:
+                        self.doChanged(ref_file, refent, l10nent)
+                        changed += 1
+                        # run checks:
+                if checker:
+                    for tp, pos, msg, cat in checker.check(refent, l10nent):
+                        # compute real src position, if first line,
+                        # col needs adjustment
+                        _l, _offset = _getLine(l10nent.val_span[0])
+                        if isinstance(pos, tuple):
+                            # line, column
+                            if pos[0] == 1:
+                                col = pos[1] + _offset
+                            else:
+                                col = pos[1]
+                            _l += pos[0] - 1
+                        else:
+                            _l, col = _getLine(l10nent.val_span[0] + pos)
+                            # skip error entities when merging
+                        if tp == 'error' and self.merge_stage is not None:
+                            skips.append(l10nent)
+                        self.notify(tp, l10n,
+                                    u"%s at line %d, column %d for %s" %
+                                    (msg, _l, col, refent.key))
+                pass
+        if missing:
+            self.notify('missing', l10n, missing)
+        if self.merge_stage is not None and (missings or skips):
+            self.merge(ref[0], ref[1], ref_file, l10n, missings, skips, p)
+        if report:
+            self.notify('report', l10n, report)
+        if obsolete:
+            self.notify('obsolete', l10n, obsolete)
+        if changed:
+            self.notify('changed', l10n, changed)
+        if unchanged:
+            self.notify('unchanged', l10n, unchanged)
+        if keys:
+            self.notify('keys', l10n, keys)
+        pass
+
+    def add(self, orig, missing):
+        if self.notify('missingFile', missing, None) == "ignore":
+            # filter said that we don't need this file, don't count it
+            return
+        f = orig
+        try:
+            p = parser.getParser(f.file)
+        except UserWarning:
+            return
+        try:
+            p.readContents(f.getContents())
+            entities, map = p.parse()
+        except Exception, e:
+            self.notify('error', f, str(e))
+            return
+        self.notify('missingInFiles', missing, len(map))
+
+    def doUnchanged(self, entity):
+        # overload this if needed
+        pass
+
+    def doChanged(self, file, ref_entity, l10n_entity):
+        # overload this if needed
+        pass
+
+
+def compareApp(app, other_observer=None, merge_stage=None, clobber=False):
+    '''Compare locales set in app.
+
+    Optional arguments are:
+    - other_observer. A object implementing
+        notify(category, _file, data)
+      The return values of that callback are ignored.
+    - merge_stage. A directory to be used for staging the output of
+      l10n-merge.
+    - clobber. Clobber the module subdirectories of the merge dir as we go.
+      Use wisely, as it might cause data loss.
+    '''
+    comparer = ContentComparer()
+    if other_observer is not None:
+        comparer.add_observer(other_observer)
+    comparer.observer.filter = app.filter
+    for module, reference, locales in app:
+        dir_comp = DirectoryCompare(reference)
+        dir_comp.setWatcher(comparer)
+        for _, localization in locales:
+            if merge_stage is not None:
+                locale_merge = merge_stage.format(ab_CD=localization.locale)
+                comparer.set_merge_stage(locale_merge)
+                if clobber:
+                    # if clobber, remove the stage for the module if it exists
+                    clobberdir = os.path.join(locale_merge, module)
+                    if os.path.exists(clobberdir):
+                        shutil.rmtree(clobberdir)
+                        print "clobbered " + clobberdir
+            dir_comp.compareWith(localization)
+    return comparer.observer
+
+
+def compareDirs(reference, locale, other_observer=None, merge_stage=None):
+    '''Compare reference and locale dir.
+
+    Optional arguments are:
+    - other_observer. A object implementing
+        notify(category, _file, data)
+      The return values of that callback are ignored.
+    '''
+    comparer = ContentComparer()
+    if other_observer is not None:
+        comparer.add_observer(other_observer)
+    comparer.set_merge_stage(merge_stage)
+    dir_comp = DirectoryCompare(paths.EnumerateDir(reference))
+    dir_comp.setWatcher(comparer)
+    dir_comp.compareWith(paths.EnumerateDir(locale))
+    return comparer.observer
diff --git a/python/compare-locales/compare_locales/parser.py b/python/compare-locales/compare_locales/parser.py
new file mode 100644
index 000000000..a97cf201b
--- /dev/null
+++ b/python/compare-locales/compare_locales/parser.py
@@ -0,0 +1,521 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+import codecs
+import logging
+from HTMLParser import HTMLParser
+
+# (path regex, parser instance) pairs searched by getParser; the real
+# list is assigned at the end of this module, after the parser classes
+__constructors = []
+
+
+class Entity(object):
+    '''
+    Abstraction layer for a localizable entity.
+    Currently supported are grammars of the form:
+
+    1: pre white space
+    2: pre comments
+    3: entity definition
+    4: entity key (name)
+    5: entity value
+    6: post comment (and white space) in the same line (dtd only)
+                                                 <--[1]
+    <!-- pre comments -->                        <--[2]
+    <!ENTITY key "value"> <!-- comment -->
+
+    <-------[3]---------><------[6]------>
+    '''
+    def __init__(self, contents, pp,
+                 span, pre_ws_span, pre_comment_span, def_span,
+                 key_span, val_span, post_span):
+        self.contents = contents
+        self.span = span
+        self.pre_ws_span = pre_ws_span
+        self.pre_comment_span = pre_comment_span
+        self.def_span = def_span
+        self.key_span = key_span
+        self.val_span = val_span
+        self.post_span = post_span
+        self.pp = pp
+        pass
+
+    # getter helpers
+
+    def get_all(self):
+        return self.contents[self.span[0]:self.span[1]]
+
+    def get_pre_ws(self):
+        return self.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
+
+    def get_pre_comment(self):
+        return self.contents[self.pre_comment_span[0]:
+                             self.pre_comment_span[1]]
+
+    def get_def(self):
+        return self.contents[self.def_span[0]:self.def_span[1]]
+
+    def get_key(self):
+        return self.contents[self.key_span[0]:self.key_span[1]]
+
+    def get_val(self):
+        return self.pp(self.contents[self.val_span[0]:self.val_span[1]])
+
+    def get_raw_val(self):
+        return self.contents[self.val_span[0]:self.val_span[1]]
+
+    def get_post(self):
+        return self.contents[self.post_span[0]:self.post_span[1]]
+
+    # getters
+
+    all = property(get_all)
+    pre_ws = property(get_pre_ws)
+    pre_comment = property(get_pre_comment)
+    definition = property(get_def)
+    key = property(get_key)
+    val = property(get_val)
+    raw_val = property(get_raw_val)
+    post = property(get_post)
+
+    def __repr__(self):
+        return self.key
+
+
+class Junk(object):
+    '''
+    An almost-Entity, representing junk data that we didn't parse.
+    This way, we can signal bad content as stuff we don't understand.
+    And the either fix that, or report real bugs in localizations.
+    '''
+    junkid = 0
+
+    def __init__(self, contents, span):
+        self.contents = contents
+        self.span = span
+        self.pre_ws = self.pre_comment = self.definition = self.post = ''
+        self.__class__.junkid += 1
+        self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])
+
+    # getter helpers
+    def get_all(self):
+        return self.contents[self.span[0]:self.span[1]]
+
+    # getters
+    all = property(get_all)
+    val = property(get_all)
+
+    def __repr__(self):
+        return self.key
+
+
+class Parser:
+    canMerge = True
+
+    def __init__(self):
+        if not hasattr(self, 'encoding'):
+            self.encoding = 'utf-8'
+        pass
+
+    def readFile(self, file):
+        f = codecs.open(file, 'r', self.encoding)
+        try:
+            self.contents = f.read()
+        except UnicodeDecodeError, e:
+            (logging.getLogger('locales')
+                    .error("Can't read file: " + file + '; ' + str(e)))
+            self.contents = u''
+        f.close()
+
+    def readContents(self, contents):
+        (self.contents, length) = codecs.getdecoder(self.encoding)(contents)
+
+    def parse(self):
+        l = []
+        m = {}
+        for e in self:
+            m[e.key] = len(l)
+            l.append(e)
+        return (l, m)
+
+    def postProcessValue(self, val):
+        return val
+
+    def __iter__(self):
+        contents = self.contents
+        offset = 0
+        self.header, offset = self.getHeader(contents, offset)
+        self.footer = ''
+        entity, offset = self.getEntity(contents, offset)
+        while entity:
+            yield entity
+            entity, offset = self.getEntity(contents, offset)
+        f = self.reFooter.match(contents, offset)
+        if f:
+            self.footer = f.group()
+            offset = f.end()
+        if len(contents) > offset:
+            yield Junk(contents, (offset, len(contents)))
+        pass
+
+    def getHeader(self, contents, offset):
+        header = ''
+        h = self.reHeader.match(contents)
+        if h:
+            header = h.group()
+            offset = h.end()
+        return (header, offset)
+
+    def getEntity(self, contents, offset):
+        m = self.reKey.match(contents, offset)
+        if m:
+            offset = m.end()
+            entity = self.createEntity(contents, m)
+            return (entity, offset)
+        # first check if footer has a non-empty match,
+        # 'cause then we don't find junk
+        m = self.reFooter.match(contents, offset)
+        if m and m.end() > offset:
+            return (None, offset)
+        m = self.reKey.search(contents, offset)
+        if m:
+            # we didn't match, but search, so there's junk between offset
+            # and start. We'll match() on the next turn
+            junkend = m.start()
+            return (Junk(contents, (offset, junkend)), junkend)
+        return (None, offset)
+
+    def createEntity(self, contents, m):
+        return Entity(contents, self.postProcessValue,
+                      *[m.span(i) for i in xrange(7)])
+
+
+def getParser(path):
+    for item in __constructors:
+        if re.search(item[0], path):
+            return item[1]
+    raise UserWarning("Cannot find Parser")
+
+
+# Subgroups of the match are:
+# 1: pre white space
+# 2: pre comments
+# 3: entity definition
+# 4: entity key (name)
+# 5: entity value
+# 6: post comment (and white space) in the same line (dtd only)
+#                                            <--[1]
+# <!-- pre comments -->                      <--[2]
+# <!ENTITY key "value"> <!-- comment -->
+#
+# <-------[3]---------><------[6]------>
+
+
+class DTDParser(Parser):
+    '''Parser for DTD files, made of <!ENTITY key "value"> declarations.
+
+    Parameter entities loaded from a SYSTEM url and referenced right
+    away are handled, too, see getEntity.
+    '''
+    # tab, newline, carriage return and the characters from u0020 to
+    # uFFFD, leaving out "-" (u002D) and the range uD800-uDFFF
+    CharMinusDash = u'\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
+    # a comment, in which a "-" needs to be followed by another character
+    XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
+    # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
+    # ":" | [A-Z] | "_" | [a-z] |
+    # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+    # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
+    # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+    # [#x10000-#xEFFFF]
+    NameStartChar = u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \
+        u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
+        u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
+    # + \U00010000-\U000EFFFF seems to be unsupported in python
+
+    # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
+    #     [#x0300-#x036F] | [#x203F-#x2040]
+    NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
+    Name = '[' + NameStartChar + '][' + NameChar + ']*'
+    # named groups: pre (white space), precomment, entity (the whole
+    # declaration), key, val (including the quotes) and post
+    reKey = re.compile('(?:(?P<pre>\s*)(?P<precomment>(?:' + XmlComment +
+                       '\s*)*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
+                       ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
+                       '(?P<post>[ \t]*(?:' + XmlComment + '\s*)*\n?)?)',
+                       re.DOTALL)
+    # add BOM to DTDs, details in bug 435002
+    # the header is an optional BOM and an optional comment that holds
+    # the MPL 2.0 url or "LICENSE BLOCK"
+    reHeader = re.compile(u'^\ufeff?'
+                          u'(\s*<!--.*(http://mozilla.org/MPL/2.0/|'
+                          u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
+    # the footer is white space and comments up to the end of the file
+    reFooter = re.compile('\s*(<!--([^-]+-)*[^-]+-->\s*)*$')
+    # parameter entity declaration followed by its reference; groups 1-6
+    # are white space, comments, definition, name, url and post comments
+    rePE = re.compile('(?:(\s*)((?:' + XmlComment + '\s*)*)'
+                      '(<!ENTITY\s+%\s+(' + Name +
+                      ')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
+                      ';)([ \t]*(?:' + XmlComment + '\s*)*\n?)?)')
+
+    def getEntity(self, contents, offset):
+        '''
+        Overload Parser.getEntity to special-case ParsedEntities.
+        Just check for a parsed entity if that method claims junk.
+
+        <!ENTITY % foo SYSTEM "url">
+        %foo;
+        '''
+        entity, inneroffset = Parser.getEntity(self, contents, offset)
+        if (entity and isinstance(entity, Junk)) or entity is None:
+            # junk or nothing found, try a parameter entity at offset
+            m = self.rePE.match(contents, offset)
+            if m:
+                inneroffset = m.end()
+                entity = Entity(contents, self.postProcessValue,
+                                *[m.span(i) for i in xrange(7)])
+        return (entity, inneroffset)
+
+    def createEntity(self, contents, m):
+        '''Create an Entity from the named groups of a reKey match.'''
+        valspan = m.span('val')
+        # the val group includes the quotes, cut off one character each
+        valspan = (valspan[0]+1, valspan[1]-1)
+        return Entity(contents, self.postProcessValue, m.span(),
+                      m.span('pre'), m.span('precomment'),
+                      m.span('entity'), m.span('key'), valspan,
+                      m.span('post'))
+
+
+class PropertiesParser(Parser):
+    '''Parser for .properties files, made of key=value or key:value lines.
+
+    Values can span several lines by ending a line in a backslash.
+    '''
+    # a backslash followed by u and up to four hex digits, by a newline
+    # and the leading white space of the next line, or by any other
+    # character
+    escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
+                        '(?P<nl>\n\s*)|(?P<single>.))', re.M)
+    known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
+
+    def __init__(self):
+        # groups: 1 white space, 2 comment lines starting with # or !,
+        # 3 key; the match ends in front of the value
+        self.reKey = re.compile('^(\s*)'
+                                '((?:[#!].*?\n\s*)*)'
+                                '([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
+        self.reHeader = re.compile('^\s*([#!].*\s*)+')
+        self.reFooter = re.compile('\s*([#!].*\s*)*$')
+        # backslashes at the end of a line
+        self._escapedEnd = re.compile(r'\\+$')
+        self._trailingWS = re.compile(r'[ \t]*$')
+        Parser.__init__(self)
+
+    def getHeader(self, contents, offset):
+        '''Return (header, offset after the header).
+
+        Leading comments only count as header if they hold the MPL 2.0
+        url or "LICENSE BLOCK", otherwise ('', offset) is returned.
+        '''
+        header = ''
+        h = self.reHeader.match(contents, offset)
+        if h:
+            candidate = h.group()
+            if 'http://mozilla.org/MPL/2.0/' in candidate or \
+                    'LICENSE BLOCK' in candidate:
+                header = candidate
+                offset = h.end()
+        return (header, offset)
+
+    def getEntity(self, contents, offset):
+        # overwritten to parse values line by line
+        m = self.reKey.match(contents, offset)
+        if m:
+            offset = m.end()
+            # find the end of the value, following escaped newlines
+            while True:
+                endval = nextline = contents.find('\n', offset)
+                if nextline == -1:
+                    # last line of the file
+                    endval = offset = len(contents)
+                    break
+                # is newline escaped?
+                _e = self._escapedEnd.search(contents, offset, nextline)
+                offset = nextline + 1
+                if _e is None:
+                    break
+                # backslashes at end of line, if 2*n, not escaped
+                if len(_e.group()) % 2 == 0:
+                    break
+            # strip trailing whitespace
+            ws = self._trailingWS.search(contents, m.end(), offset)
+            if ws:
+                endval -= ws.end() - ws.start()
+            entity = Entity(contents, self.postProcessValue,
+                            (m.start(), offset),   # full span
+                            m.span(1),  # leading white space span
+                            m.span(2),  # leading comment span
+                            (m.start(3), offset),   # entity def span
+                            m.span(3),   # key span
+                            (m.end(), endval),   # value span
+                            (offset, offset))  # post comment span, empty
+            return (entity, offset)
+        m = self.reKey.search(contents, offset)
+        if m:
+            # we didn't match, but search, so there's junk between offset
+            # and start. We'll match() on the next turn
+            junkend = m.start()
+            return (Junk(contents, (offset, junkend)), junkend)
+        return (None, offset)
+
+    def postProcessValue(self, val):
+        '''Return val with the backslash escapes resolved.'''
+
+        def unescape(m):
+            found = m.groupdict()
+            if found['uni']:
+                # unicode escape, the hex digits follow the u
+                return unichr(int(found['uni'][1:], 16))
+            if found['nl']:
+                # line continuation, drops the newline and the leading
+                # white space of the next line
+                return ''
+            # \n, \r, \t and \\, any other character stands for itself
+            return self.known_escapes.get(found['single'], found['single'])
+        val = self.escape.sub(unescape, val)
+        return val
+
+
+class DefinesParser(Parser):
+    '''Parser for files made of "#define key value" lines.
+
+    Lines starting with # that are not a #define count as comments.
+    '''
+    # can't merge, #unfilter needs to be the last item, which we don't support
+    canMerge = False
+
+    def __init__(self):
+        # groups: 1 white space, 2 comments, 3 definition, 4 key, 5 value,
+        # 6 trailing white space and the newline
+        self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
+                                '(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
+                                re.M)
+        self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
+        self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
+        Parser.__init__(self)
+
+
+class IniParser(Parser):
+    '''
+    Parse files of the form:
+    # initial comment
+    [cat]
+    whitespace*
+    #comment
+    string=value
+    ...
+    '''
+    def __init__(self):
+        # the header is everything up to and including the first
+        # [section] line
+        self.reHeader = re.compile('^((?:\s*|[;#].*)\n)*\[.+?\]\n', re.M)
+        # groups: 1 white space, 2 comment lines starting with ; or #,
+        # 3 definition, 4 key, 5 value, 6 the newline
+        self.reKey = re.compile('(\s*)((?:[;#].*\n\s*)*)((.+?)=(.*))(\n?)')
+        self.reFooter = re.compile('\s*([;#].*\s*)*$')
+        Parser.__init__(self)
+
+
+# token types, used as _type of the tokens of BookmarksParserInner
+DECL, COMMENT, START, END, CONTENT = range(5)
+
+
+class BookmarksParserInner(HTMLParser):
+
+    class Token(object):
+        _type = None
+        content = ''
+
+        def __str__(self):
+            return self.content
+
+    class DeclToken(Token):
+        _type = DECL
+
+        def __init__(self, decl):
+            self.content = decl
+            pass
+
+        def __str__(self):
+            return '<!%s>' % self.content
+        pass
+
+    class CommentToken(Token):
+        _type = COMMENT
+
+        def __init__(self, comment):
+            self.content = comment
+            pass
+
+        def __str__(self):
+            return '<!--%s-->' % self.content
+        pass
+
+    class StartToken(Token):
+        _type = START
+
+        def __init__(self, tag, attrs, content):
+            self.tag = tag
+            self.attrs = dict(attrs)
+            self.content = content
+            pass
+        pass
+
+    class EndToken(Token):
+        _type = END
+
+        def __init__(self, tag):
+            self.tag = tag
+            pass
+
+        def __str__(self):
+            return '</%s>' % self.tag.upper()
+        pass
+
+    class ContentToken(Token):
+        _type = CONTENT
+
+        def __init__(self, content):
+            self.content = content
+            pass
+        pass
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.tokens = []
+
+    def parse(self, contents):
+        self.tokens = []
+        self.feed(contents)
+        self.close()
+        return self.tokens
+
+    # Called when we hit an end DL tag to reset the folder selections
+    def handle_decl(self, decl):
+        self.tokens.append(self.DeclToken(decl))
+
+    # Called when we hit an end DL tag to reset the folder selections
+    def handle_comment(self, comment):
+        self.tokens.append(self.CommentToken(comment))
+
+    def handle_starttag(self, tag, attrs):
+        self.tokens.append(self.StartToken(tag, attrs,
+                                           self.get_starttag_text()))
+
+    # Called when text data is encountered
+    def handle_data(self, data):
+        if self.tokens[-1]._type == CONTENT:
+            self.tokens[-1].content += data
+        else:
+            self.tokens.append(self.ContentToken(data))
+
+    def handle_charref(self, data):
+        self.handle_data('&#%s;' % data)
+
+    def handle_entityref(self, data):
+        self.handle_data('&%s;' % data)
+
+    # Called when we hit an end DL tag to reset the folder selections
+    def handle_endtag(self, tag):
+        self.tokens.append(self.EndToken(tag))
+
+
+class BookmarksParser(Parser):
+    canMerge = False
+
+    class BMEntity(object):
+        def __init__(self, key, val):
+            self.key = key
+            self.val = val
+
+    def __iter__(self):
+        p = BookmarksParserInner()
+        tks = p.parse(self.contents)
+        i = 0
+        k = []
+        for i in xrange(len(tks)):
+            t = tks[i]
+            if t._type == START:
+                k.append(t.tag)
+                keys = t.attrs.keys()
+                keys.sort()
+                for attrname in keys:
+                    yield self.BMEntity('.'.join(k) + '.@' + attrname,
+                                        t.attrs[attrname])
+                if i + 1 < len(tks) and tks[i+1]._type == CONTENT:
+                    i += 1
+                    t = tks[i]
+                    v = t.content.strip()
+                    if v:
+                        yield self.BMEntity('.'.join(k), v)
+            elif t._type == END:
+                k.pop()
+
+
+# The parsers by file name, as (regex, parser instance) pairs. getParser
+# returns these instances, so all files of a type share one parser.
+__constructors = [('\\.dtd$', DTDParser()),
+                  ('\\.properties$', PropertiesParser()),
+                  ('\\.ini$', IniParser()),
+                  ('\\.inc$', DefinesParser()),
+                  ('bookmarks\\.html$', BookmarksParser())]
diff --git a/python/compare-locales/compare_locales/paths.py b/python/compare-locales/compare_locales/paths.py
new file mode 100644
index 000000000..f72b3a2e7
--- /dev/null
+++ b/python/compare-locales/compare_locales/paths.py
@@ -0,0 +1,398 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import os.path
+import os
+from ConfigParser import ConfigParser, NoSectionError, NoOptionError
+from urlparse import urlparse, urljoin
+from urllib import pathname2url, url2pathname
+from urllib2 import urlopen
+from collections import defaultdict
+from compare_locales import util
+
+
+class L10nConfigParser(object):
+    '''Helper class to gather application information from ini files.
+
+    This class is working on synchronous open to read files or web data.
+    Subclass this and overwrite loadConfigs and addChild if you need async.
+    '''
+    def __init__(self, inipath, **kwargs):
+        """Constructor for L10nConfigParsers
+
+        inipath -- l10n.ini path, stored as a URL in self.inipath
+        Optional keyword arguments are forwarded to the inner ConfigParser as
+        defaults.
+        """
+        if os.path.isabs(inipath):
+            self.inipath = 'file:%s' % pathname2url(inipath)
+        else:
+            pwdurl = 'file:%s/' % pathname2url(os.getcwd())
+            self.inipath = urljoin(pwdurl, inipath)
+        # l10n.ini files can import other l10n.ini files, store the
+        # corresponding L10nConfigParsers
+        self.children = []
+        # we really only care about the l10n directories described in l10n.ini
+        self.dirs = []
+        # optional defaults passed to the inner ConfigParser in onLoadConfig
+        self.defaults = kwargs
+
+    def getDepth(self, cp):
+        '''Get the comparison depth from the parsed l10n.ini, '.' if not set.
+
+        Overloadable to get the source depth for fennec and friends.
+        '''
+        try:
+            depth = cp.get('general', 'depth')
+        except (NoOptionError, NoSectionError):
+            depth = '.'
+        return depth
+
+    def getFilters(self):
+        '''Get the test functions from this ConfigParser and all children.
+
+        Only works with synchronous loads, used by compare-locales, which
+        is local anyway.
+        '''
+        filterurl = urljoin(self.inipath, 'filter.py')
+        filters = []
+        try:
+            scope = {}
+            execfile(url2pathname(urlparse(filterurl).path), {}, scope)
+            if 'test' in scope and callable(scope['test']):
+                filters.append(scope['test'])
+        except Exception:
+            # best effort: a missing or broken filter.py means no filter
+            pass
+
+        for c in self.children:
+            filters += c.getFilters()
+
+        return filters
+
+    def loadConfigs(self):
+        """Entry point to load the l10n.ini file this Parser refers to.
+
+        This implementation uses synchronous loads, subclasses might overload
+        this behaviour. If you do, make sure to pass a file-like object
+        to onLoadConfig.
+        """
+        self.onLoadConfig(urlopen(self.inipath))
+
+    def onLoadConfig(self, inifile):
+        """Parse a file-like object for the loaded l10n.ini file."""
+        cp = ConfigParser(self.defaults)
+        cp.readfp(inifile)
+        depth = self.getDepth(cp)
+        self.baseurl = urljoin(self.inipath, depth)
+        # create child loaders for any other l10n.ini files to be included
+        try:
+            for title, path in cp.items('includes'):
+                # skip default items
+                if title in self.defaults:
+                    continue
+                # add child config parser
+                self.addChild(title, path, cp)
+        except NoSectionError:
+            pass
+        # try to load the "dirs" defined in the "compare" section
+        try:
+            self.dirs.extend(cp.get('compare', 'dirs').split())
+        except (NoOptionError, NoSectionError):
+            pass
+        # try getting a top level compare dir, as used for fennec
+        try:
+            self.tld = cp.get('compare', 'tld')
+            # remove tld from comparison dirs
+            if self.tld in self.dirs:
+                self.dirs.remove(self.tld)
+        except (NoOptionError, NoSectionError):
+            self.tld = None
+        # try to set "all_path" and "all_url"
+        try:
+            self.all_path = cp.get('general', 'all')
+            self.all_url = urljoin(self.baseurl, self.all_path)
+        except (NoOptionError, NoSectionError):
+            self.all_path = None
+            self.all_url = None
+        return cp
+
+    def addChild(self, title, path, orig_cp):
+        """Create a child L10nConfigParser and load it.
+
+        title -- indicates the module's name
+        path -- indicates the path to the module's l10n.ini file
+        orig_cp -- the configuration parser of this l10n.ini
+        """
+        cp = L10nConfigParser(urljoin(self.baseurl, path), **self.defaults)
+        cp.loadConfigs()
+        self.children.append(cp)
+
+    def getTLDPathsTuple(self, basepath):
+        """Given the basepath, return the path fragments to be used for
+        self.tld. For build runs, this is (basepath, self.tld), for
+        source runs, just (basepath,).
+
+        @see overwritten method in SourceTreeConfigParser.
+        """
+        return (basepath, self.tld)
+
+    def dirsIter(self):
+        """Iterate over all dirs and our base path for this l10n.ini"""
+        url = urlparse(self.baseurl)
+        basepath = url2pathname(url.path)
+        if self.tld is not None:
+            yield self.tld, self.getTLDPathsTuple(basepath)
+        for dir in self.dirs:
+            yield dir, (basepath, dir)
+
+    def directories(self):
+        """Iterate over all dirs and base paths for this l10n.ini as well
+        as the included ones.
+        """
+        for t in self.dirsIter():
+            yield t
+        for child in self.children:
+            for t in child.directories():
+                yield t
+
+    def allLocales(self):
+        """Return a list of all the locales of this project"""
+        return util.parseLocales(urlopen(self.all_url).read())
+
+
+class SourceTreeConfigParser(L10nConfigParser):
+    '''Subclassing L10nConfigParser to work with just the repos
+    checked out next to each other instead of intermingled like
+    we do for real builds.
+    '''
+
+    def __init__(self, inipath, basepath, **kwargs):
+        '''Add the additional argument basepath.
+
+        basepath resolves local paths via branchnames, kwargs are forwarded.
+        '''
+        L10nConfigParser.__init__(self, inipath, **kwargs)
+        self.basepath = basepath
+        self.tld = None
+
+    def getDepth(self, cp):
+        '''Get the depth for the comparison from the parsed l10n.ini.
+
+        Overloaded to prefer source-depth, then depth, then '.'.
+        '''
+        try:
+            depth = cp.get('general', 'source-depth')
+        except (NoOptionError, NoSectionError):
+            try:
+                depth = cp.get('general', 'depth')
+            except (NoOptionError, NoSectionError):
+                depth = '.'
+        return depth
+
+    def addChild(self, title, path, orig_cp):
+        # check if there's a section with details for this include
+        # we might have to check a different repo, or even VCS
+        # for example, projects like "mail" indicate in
+        # an "include_" section where to find the l10n.ini for "toolkit"
+        details = 'include_' + title
+        if orig_cp.has_section(details):
+            branch = orig_cp.get(details, 'mozilla')
+            inipath = orig_cp.get(details, 'l10n.ini')
+            path = self.basepath + '/' + branch + '/' + inipath
+        else:
+            path = urljoin(self.baseurl, path)
+        cp = SourceTreeConfigParser(path, self.basepath, **self.defaults)
+        cp.loadConfigs()
+        self.children.append(cp)
+
+    def getTLDPathsTuple(self, basepath):
+        """Overwrite L10nConfigParser's getTLDPathsTuple to just return
+        the basepath.
+        """
+        return (basepath, )
+
+
+class File(object):
+    '''A file to compare, identified by its module and relative path.'''
+    def __init__(self, fullpath, file, module=None, locale=None):
+        self.fullpath = fullpath
+        self.file = file
+        self.module = module
+        self.locale = locale
+
+    def getContents(self):
+        '''Return the file content, read with universal line endings.'''
+        with open(self.fullpath, 'rU') as f:
+            return f.read()
+
+    def __hash__(self):
+        f = self.file
+        if self.module:
+            f = self.module + '/' + f
+        return hash(f)
+
+    def __str__(self):
+        return self.fullpath
+
+    def __cmp__(self, other):
+        if not isinstance(other, File):
+            raise NotImplementedError
+        rv = cmp(self.module, other.module)
+        if rv != 0:
+            return rv
+        return cmp(self.file, other.file)
+
+
+class EnumerateDir(object):
+    '''Enumerate the File objects below basepath, directory by directory.'''
+    ignore_dirs = ['CVS', '.svn', '.hg', '.git']
+
+    def __init__(self, basepath, module='', locale=None, ignore_subdirs=None):
+        self.basepath = basepath
+        self.module = module
+        self.locale = locale
+        # a None default avoids sharing one mutable list between instances
+        self.ignore_subdirs = [] if ignore_subdirs is None else ignore_subdirs
+
+    def cloneFile(self, other):
+        '''Return the File this enumerator would yield for other.file.'''
+        return File(os.path.join(self.basepath, other.file), other.file,
+                    self.module, self.locale)
+
+    def __iter__(self):
+        # our local dirs are given as a tuple of path segments, starting off
+        # with an empty sequence for the basepath.
+        dirs = [()]
+        # full paths of the subdirs to skip, computed once, not per entry
+        ignored = set(os.path.join(self.basepath, d)
+                      for d in self.ignore_subdirs)
+        while dirs:
+            dir = dirs.pop(0)
+            fulldir = os.path.join(self.basepath, *dir)
+            try:
+                entries = os.listdir(fulldir)
+            except OSError:
+                # we probably just started off in a non-existing dir, ignore
+                continue
+            entries.sort()
+            for entry in entries:
+                leaf = os.path.join(fulldir, entry)
+                if os.path.isdir(leaf):
+                    if entry not in self.ignore_dirs and leaf not in ignored:
+                        dirs.append(dir + (entry,))
+                    continue
+                yield File(leaf, '/'.join(dir + (entry,)),
+                           self.module, self.locale)
+
+
+class LocalesWrap(object):
+    '''Iterate over (locale, EnumerateDir) pairs for one module.'''
+    def __init__(self, base, module, locales, ignore_subdirs=None):
+        self.base = base
+        self.module = module
+        self.locales = locales
+        self.ignore_subdirs = [] if ignore_subdirs is None else ignore_subdirs
+
+    def __iter__(self):
+        for locale in self.locales:
+            path = os.path.join(self.base, locale, self.module)
+            yield (locale, EnumerateDir(path, self.module, locale,
+                                        self.ignore_subdirs))
+
+
+class EnumerateApp(object):
+    reference = 'en-US'
+
+    def __init__(self, inipath, l10nbase, locales=None):
+        '''Load the l10n.ini at inipath, its filters, and the locale list.'''
+        self.setupConfigParser(inipath)
+        self.modules = defaultdict(dict)
+        self.l10nbase = os.path.abspath(l10nbase)
+        self.filters = []
+        self.addFilters(*self.config.getFilters())
+        # sort a copy, so that the caller's list of locales isn't reordered
+        self.locales = sorted(locales or self.config.allLocales())
+
+    def setupConfigParser(self, inipath):
+        self.config = L10nConfigParser(inipath)
+        self.config.loadConfigs()
+
+    def addFilters(self, *args):
+        self.filters += args
+
+    value_map = {None: None, 'error': 0, 'ignore': 1, 'report': 2}
+
+    def filter(self, l10n_file, entity=None):
+        '''Go through all added filters, and,
+        - map "error" -> 0, "ignore" -> 1, "report" -> 2
+        - if filter.test returns a bool, map that to
+            False -> "ignore" (1), True -> "error" (0)
+        - take the max of all reported
+        '''
+        rv = 0
+        for f in reversed(self.filters):
+            try:
+                _r = f(l10n_file.module, l10n_file.file, entity)
+            except Exception:
+                # XXX error handling, a failing filter is skipped for now
+                continue
+            if isinstance(_r, bool):
+                _r = [1, 0][_r]
+            else:
+                # map string return value to int, default to 'error',
+                # None is None
+                _r = self.value_map.get(_r, 0)
+            if _r is not None:
+                rv = max(rv, _r)
+        return ['error', 'ignore', 'report'][rv]
+
+    def __iter__(self):
+        '''
+        Iterate over all modules, return en-US directory enumerator, and an
+        iterator over all locales in each iteration. Per locale, the locale
+        code and a directory enumerator will be given.
+        '''
+        dirmap = dict(self.config.directories())
+        mods = sorted(dirmap)
+        for mod in mods:
+            if self.reference == 'en-US':
+                base = os.path.join(*(dirmap[mod] + ('locales', 'en-US')))
+            else:
+                base = os.path.join(self.l10nbase, self.reference, mod)
+            # paths of modules nested below mod, to be ignored in the l10n dirs
+            subdirs = [m[len(mod)+1:] for m in mods if m.startswith(mod+'/')]
+            yield (mod, EnumerateDir(base, mod, self.reference),
+                   LocalesWrap(self.l10nbase, mod, self.locales, subdirs))
+
+
+class EnumerateSourceTreeApp(EnumerateApp):
+    '''Subclass EnumerateApp to work on side-by-side checked out
+    repos, and to not pay attention to how the source would actually
+    be checked out for building.
+
+    It's supporting applications like Fennec, too, which have
+    'locales/en-US/...' in their root dir, but claim to be 'mobile'.
+    '''
+
+    def __init__(self, inipath, basepath, l10nbase, locales=None):
+        self.basepath = basepath
+        EnumerateApp.__init__(self, inipath, l10nbase, locales)
+
+    def setupConfigParser(self, inipath):
+        self.config = SourceTreeConfigParser(inipath, self.basepath)
+        self.config.loadConfigs()
+
+
+def get_base_path(mod, loc):
+    '''Return the base path of module mod for locale loc, which is below
+    mozilla/ for en-US, and below l10n/ for any other locale.
+    '''
+    if loc != 'en-US':
+        return 'l10n/%(loc)s/%(mod)s' % {'mod': mod, 'loc': loc}
+    return 'mozilla/%(mod)s/locales/en-US' % {'mod': mod}
+
+
+def get_path(mod, loc, leaf):
+    return get_base_path(mod, loc) + '/' + leaf
diff --git a/python/compare-locales/compare_locales/tests/__init__.py b/python/compare-locales/compare_locales/tests/__init__.py
new file mode 100644
index 000000000..8808d78f4
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/__init__.py
@@ -0,0 +1,49 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''Mixins for parser tests.
+'''
+
+from itertools import izip_longest
+from pkg_resources import resource_string
+import re
+
+from compare_locales.parser import getParser
+
+
+class ParserTestMixin():
+    '''Utility methods used by the parser tests.
+    '''
+    filename = None
+
+    def setUp(self):
+        '''Create a parser for this test.
+        '''
+        self.parser = getParser(self.filename)
+
+    def tearDown(self):
+        'tear down this test'
+        del self.parser
+
+    def resource(self, name):
+        testcontent = resource_string(__name__, 'data/' + name)
+        # fake universal line endings
+        testcontent = re.sub('\r\n?', lambda m: '\n', testcontent)
+        return testcontent
+
+    def _test(self, content, refs):
+        '''Helper to test the parser.
+        Compares the result of parsing content with the given list
+        of reference keys and values.
+        '''
+        self.parser.readContents(content)
+        entities = [entity for entity in self.parser]
+        for entity, ref in izip_longest(entities, refs):
+            self.assertTrue(entity, 'excess reference entity')
+            self.assertTrue(ref, 'excess parsed entity')
+            self.assertEqual(entity.val, ref[1])
+            if ref[0].startswith('_junk'):
+                self.assertTrue(re.match(ref[0], entity.key))
+            else:
+                self.assertEqual(entity.key, ref[0])
diff --git a/python/compare-locales/compare_locales/tests/data/bug121341.properties b/python/compare-locales/compare_locales/tests/data/bug121341.properties
new file mode 100644
index 000000000..b45fc9698
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/bug121341.properties
@@ -0,0 +1,68 @@
+# simple check
+1=abc
+# test whitespace trimming in key and value
+  2	=   xy	
+# test parsing of escaped values
+3 = \u1234\t\r\n\uAB\
+\u1\n
+# test multiline properties
+4 = this is \
+multiline property
+5 = this is \
+	   another multiline property
+# property with DOS EOL
+6 = test\u0036
+# test multiline property with with DOS EOL
+7 = yet another multi\
+    line propery
+# trimming should not trim escaped whitespaces
+8 =	\ttest5\u0020	
+# another variant of #8
+9 =     \ test6\t	    
+# test UTF-8 encoded property/value
+10aሴb = c췯d
+# next property should test unicode escaping at the boundary of parsing buffer
+# buffer size is expected to be 4096 so add comments to get to this offset
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+###############################################################################
+11 = \uABCD
diff --git a/python/compare-locales/compare_locales/tests/data/test.properties b/python/compare-locales/compare_locales/tests/data/test.properties
new file mode 100644
index 000000000..19cae9702
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/test.properties
@@ -0,0 +1,14 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+1=1
+ 2=2
+3 =3
+ 4 =4
+5=5
+6= 6
+7=7 
+8= 8 
+# this is a comment
+9=this is the first part of a continued line \
+ and here is the 2nd part
diff --git a/python/compare-locales/compare_locales/tests/data/triple-license.dtd b/python/compare-locales/compare_locales/tests/data/triple-license.dtd
new file mode 100644
index 000000000..4a28b17a6
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/triple-license.dtd
@@ -0,0 +1,38 @@
+<!-- ***** BEGIN LICENSE BLOCK *****
+#if 0
+   - Version: MPL 1.1/GPL 2.0/LGPL 2.1
+   -
+   - The contents of this file are subject to the Mozilla Public License Version
+   - 1.1 (the "License"); you may not use this file except in compliance with
+   - the License. You may obtain a copy of the License at
+   - http://www.mozilla.org/MPL/
+   -
+   - Software distributed under the License is distributed on an "AS IS" basis,
+   - WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+   - for the specific language governing rights and limitations under the
+   - License.
+   -
+   - The Original Code is mozilla.org Code.
+   -
+   - The Initial Developer of the Original Code is dummy.
+   - Portions created by the Initial Developer are Copyright (C) 2005
+   - the Initial Developer. All Rights Reserved.
+   -
+   - Contributor(s):
+   -
+   - Alternatively, the contents of this file may be used under the terms of
+   - either the GNU General Public License Version 2 or later (the "GPL"), or
+   - the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+   - in which case the provisions of the GPL or the LGPL are applicable instead
+   - of those above. If you wish to allow use of your version of this file only
+   - under the terms of either the GPL or the LGPL, and not to allow others to
+   - use your version of this file under the terms of the MPL, indicate your
+   - decision by deleting the provisions above and replace them with the notice
+   - and other provisions required by the LGPL or the GPL. If you do not delete
+   - the provisions above, a recipient may use your version of this file under
+   - the terms of any one of the MPL, the GPL or the LGPL.
+   -
+#endif
+   - ***** END LICENSE BLOCK ***** -->
+
+<!ENTITY foo "value">
diff --git a/python/compare-locales/compare_locales/tests/test_checks.py b/python/compare-locales/compare_locales/tests/test_checks.py
new file mode 100644
index 000000000..b995d43f9
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_checks.py
@@ -0,0 +1,403 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.checks import getChecker
+from compare_locales.parser import getParser, Entity
+from compare_locales.paths import File
+
+
+class BaseHelper(unittest.TestCase):
+    file = None
+    refContent = None
+
+    def setUp(self):
+        p = getParser(self.file.file)
+        p.readContents(self.refContent)
+        self.refList, self.refMap = p.parse()
+
+    def _test(self, content, refWarnOrErrors, with_ref_file=False):
+        p = getParser(self.file.file)
+        p.readContents(content)
+        l10n = [e for e in p]
+        assert len(l10n) == 1
+        l10n = l10n[0]
+        if with_ref_file:
+            kwargs = {
+                'reference': self.refList
+            }
+        else:
+            kwargs = {}
+        checker = getChecker(self.file, **kwargs)
+        ref = self.refList[self.refMap[l10n.key]]
+        found = tuple(checker.check(ref, l10n))
+        self.assertEqual(found, refWarnOrErrors)
+
+
+class TestProperties(BaseHelper):
+    file = File('foo.properties', 'foo.properties')
+    refContent = '''some = value
+'''
+
+    def testGood(self):
+        self._test('''some = localized''',
+                   tuple())
+
+    def testMissedEscape(self):
+        self._test(r'''some = \u67ood escape, bad \escape''',
+                   (('warning', 20, r'unknown escape sequence, \e',
+                     'escape'),))
+
+
+class TestPlurals(BaseHelper):
+    file = File('foo.properties', 'foo.properties')
+    refContent = '''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - #2
+'''
+
+    def testGood(self):
+        self._test('''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - #2;#1 filers
+''',
+                   tuple())
+
+    def testNotUsed(self):
+        self._test('''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - Downloads;#1 filers
+''',
+                   (('warning', 0, 'not all variables used in l10n',
+                     'plural'),))
+
+    def testNotDefined(self):
+        self._test('''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - #2;#1 #3
+''',
+                   (('error', 0, 'unreplaced variables in l10n', 'plural'),))
+
+
+class TestDTDs(BaseHelper):
+    file = File('foo.dtd', 'foo.dtd')
+    refContent = '''<!ENTITY foo "This is &apos;good&apos;">
+<!ENTITY width "10ch">
+<!ENTITY style "width: 20ch; height: 280px;">
+<!ENTITY minStyle "min-height: 50em;">
+<!ENTITY ftd "0">
+<!ENTITY formatPercent "This is 100&#037; correct">
+<!ENTITY some.key "K">
+'''
+
+    def testWarning(self):
+        self._test('''<!ENTITY foo "This is &not; good">
+''',
+                   (('warning', (0, 0), 'Referencing unknown entity `not`',
+                     'xmlparse'),))
+        # make sure we only handle translated entity references
+        self._test(u'''<!ENTITY foo "This is &ƞǿŧ; good">
+'''.encode('utf-8'),
+            (('warning', (0, 0), u'Referencing unknown entity `ƞǿŧ`',
+              'xmlparse'),))
+
+    def testErrorFirstLine(self):
+        self._test('''<!ENTITY foo "This is </bad> stuff">
+''',
+                   (('error', (1, 10), 'mismatched tag', 'xmlparse'),))
+
+    def testErrorSecondLine(self):
+        self._test('''<!ENTITY foo "This is
+  </bad>
+stuff">
+''',
+                   (('error', (2, 4), 'mismatched tag', 'xmlparse'),))
+
+    def testKeyErrorSingleAmpersand(self):
+        self._test('''<!ENTITY some.key "&">
+''',
+                   (('error', (1, 1), 'not well-formed (invalid token)',
+                     'xmlparse'),))
+
+    def testXMLEntity(self):
+        self._test('''<!ENTITY foo "This is &quot;good&quot;">
+''',
+                   tuple())
+
+    def testPercentEntity(self):
+        self._test('''<!ENTITY formatPercent "Another 100&#037;">
+''',
+                   tuple())
+        self._test('''<!ENTITY formatPercent "Bad 100% should fail">
+''',
+                   (('error', (0, 32), 'not well-formed (invalid token)',
+                     'xmlparse'),))
+
+    def testNoNumber(self):
+        self._test('''<!ENTITY ftd "foo">''',
+                   (('warning', 0, 'reference is a number', 'number'),))
+
+    def testNoLength(self):
+        self._test('''<!ENTITY width "15miles">''',
+                   (('error', 0, 'reference is a CSS length', 'css'),))
+
+    def testNoStyle(self):
+        self._test('''<!ENTITY style "15ch">''',
+                   (('error', 0, 'reference is a CSS spec', 'css'),))
+        self._test('''<!ENTITY style "junk">''',
+                   (('error', 0, 'reference is a CSS spec', 'css'),))
+
+    def testStyleWarnings(self):
+        self._test('''<!ENTITY style "width:15ch">''',
+                   (('warning', 0, 'height only in reference', 'css'),))
+        self._test('''<!ENTITY style "width:15em;height:200px;">''',
+                   (('warning', 0, "units for width don't match (em != ch)",
+                     'css'),))
+
+    def testNoWarning(self):
+        self._test('''<!ENTITY width "12em">''', tuple())
+        self._test('''<!ENTITY style "width:12ch;height:200px;">''', tuple())
+        self._test('''<!ENTITY ftd "0">''', tuple())
+
+
+class TestEntitiesInDTDs(BaseHelper):
+    file = File('foo.dtd', 'foo.dtd')
+    refContent = '''<!ENTITY short "This is &brandShortName;">
+<!ENTITY shorter "This is &brandShorterName;">
+<!ENTITY ent.start "Using &brandShorterName; start to">
+<!ENTITY ent.end " end">
+'''
+
+    def testOK(self):
+        self._test('''<!ENTITY ent.start "Mit &brandShorterName;">''', tuple(),
+                   with_ref_file=True)
+
+    def testMismatch(self):
+        self._test('''<!ENTITY ent.start "Mit &brandShortName;">''',
+                   (('warning', (0, 0),
+                     'Entity brandShortName referenced, '
+                     'but brandShorterName used in context',
+                     'xmlparse'),),
+                   with_ref_file=True)
+
+    def testAcross(self):
+        self._test('''<!ENTITY ent.end "Mit &brandShorterName;">''',
+                   tuple(),
+                   with_ref_file=True)
+
+    def testAcrossWithMismatch(self):
+        '''If we could tell that ent.start and ent.end are one string,
+        we should warn. Sadly, we can't, so this goes without warning.'''
+        self._test('''<!ENTITY ent.end "Mit &brandShortName;">''',
+                   tuple(),
+                   with_ref_file=True)
+
+    def testUnknownWithRef(self):
+        self._test('''<!ENTITY ent.start "Mit &foopy;">''',
+                   (('warning',
+                     (0, 0),
+                     'Referencing unknown entity `foopy` '
+                     '(brandShorterName used in context, '
+                     'brandShortName known)',
+                     'xmlparse'),),
+                   with_ref_file=True)
+
+    def testUnknown(self):
+        self._test('''<!ENTITY ent.end "Mit &foopy;">''',
+                   (('warning',
+                     (0, 0),
+                     'Referencing unknown entity `foopy`'
+                     ' (brandShortName, brandShorterName known)',
+                     'xmlparse'),),
+                   with_ref_file=True)
+
+
+class TestAndroid(unittest.TestCase):
+    """Test Android checker
+
+    Make sure we're hitting our extra rules only if
+    we're passing in a DTD file in the embedding/android module.
+    """
+    apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
+               u"\\u0027, or use \u2019, or put string in quotes."
+    quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
+               u"\\u0022, or put string in apostrophes."
+
+    def getEntity(self, v):
+        return Entity(v, lambda s: s, (0, len(v)), (), (0, 0), (), (),
+                      (0, len(v)), ())
+
+    def getDTDEntity(self, v):
+        v = v.replace('"', '&quot;')
+        return Entity('<!ENTITY foo "%s">' % v,
+                      lambda s: s,
+                      (0, len(v) + 16), (), (0, 0), (), (9, 12),
+                      (14, len(v) + 14), ())
+
+    def test_android_dtd(self):
+        """Testing the actual android checks. The logic is involved,
+        so this is a lot of nitty gritty detail tests.
+        """
+        f = File("embedding/android/strings.dtd", "strings.dtd",
+                 "embedding/android")
+        checker = getChecker(f)
+        # good string
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # dtd warning
+        l10n = self.getDTDEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('warning', (0, 0),
+                           'Referencing unknown entity `ref`', 'xmlparse'),))
+        # apostrophes and quotes: errors unless escaped or string is quoted
+        for i in xrange(3):
+            # make sure we're catching unescaped apostrophes,
+            # try 0..5 backslashes
+            l10n = self.getDTDEntity("\\"*(2*i) + "'")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             (('error', 2*i, self.apos_msg, 'android'),))
+            l10n = self.getDTDEntity("\\"*(2*i + 1) + "'")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             ())
+            # make sure we don't report if apos string is quoted
+            l10n = self.getDTDEntity('"' + "\\"*(2*i) + "'\"")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s"
+                             % (l10n.val, str(tpl)))
+            l10n = self.getDTDEntity('"' + "\\"*(2*i+1) + "'\"")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s"
+                             % (l10n.val, str(tpl)))
+            # make sure we're catching unescaped quotes, try 0..5 backslashes
+            l10n = self.getDTDEntity("\\"*(2*i) + "\"")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             (('error', 2*i, self.quot_msg, 'android'),))
+            l10n = self.getDTDEntity("\\"*(2*i + 1) + "'")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             ())
+            # make sure we don't report if quote string is single quoted
+            l10n = self.getDTDEntity("'" + "\\"*(2*i) + "\"'")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s" %
+                             (l10n.val, str(tpl)))
+            l10n = self.getDTDEntity('"' + "\\"*(2*i+1) + "'\"")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s" %
+                             (l10n.val, str(tpl)))
+        # check for mixed quotes and apostrophes
+        l10n = self.getDTDEntity("'\"")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 0, self.apos_msg, 'android'),
+                          ('error', 1, self.quot_msg, 'android')))
+        l10n = self.getDTDEntity("''\"'")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 1, self.apos_msg, 'android'),))
+        l10n = self.getDTDEntity('"\'""')
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 2, self.quot_msg, 'android'),))
+
+        # broken unicode escape
+        l10n = self.getDTDEntity("Some broken \u098 unicode")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 12, 'truncated \\uXXXX escape',
+                           'android'),))
+        # broken unicode escape after non-ASCII chars, checks the offset
+        l10n = self.getDTDEntity(u"\u9690"*14+"\u006"+"  "+"\u0064")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 14, 'truncated \\uXXXX escape',
+                           'android'),))
+
+    def test_android_prop(self):
+        f = File("embedding/android/strings.properties", "strings.properties",
+                 "embedding/android")
+        checker = getChecker(f)
+        # good plain string
+        ref = self.getEntity("plain string")
+        l10n = self.getEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # no dtd warning
+        ref = self.getEntity("plain string")
+        l10n = self.getEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # no report on stray apostrophe
+        ref = self.getEntity("plain string")
+        l10n = self.getEntity("plain localized string with apos: '")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # report on bad printf
+        ref = self.getEntity("string with %s")
+        l10n = self.getEntity("string with %S")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 0, 'argument 1 `S` should be `s`',
+                           'printf'),))
+
+    def test_non_android_dtd(self):
+        f = File("browser/strings.dtd", "strings.dtd", "browser")
+        checker = getChecker(f)
+        # good string
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # dtd warning
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('warning', (0, 0),
+                          'Referencing unknown entity `ref`', 'xmlparse'),))
+        # no report on stray apostrophe outside of android
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string with apos: '")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+
+    def test_entities_across_dtd(self):
+        f = File("browser/strings.dtd", "strings.dtd", "browser")
+        p = getParser(f.file)
+        p.readContents('<!ENTITY other "some &good.ref;">')
+        ref = p.parse()
+        checker = getChecker(f, reference=ref[0])
+        # good string
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # dtd warning, listing the entity known from the reference
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('warning', (0, 0),
+                           'Referencing unknown entity `ref` (good.ref known)',
+                           'xmlparse'),))
+        # no report on an entity that the reference file uses, too
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string with &good.ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_compare.py b/python/compare-locales/compare_locales/tests/test_compare.py
new file mode 100644
index 000000000..51ba7cd8c
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_compare.py
@@ -0,0 +1,90 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import compare
+
+
+class TestTree(unittest.TestCase):
+    '''Test the Tree utility class
+
+    Tree value classes need to be in-place editable
+    '''
+
+    def test_empty_dict(self):
+        tree = compare.Tree(dict)
+        self.assertEqual(list(tree.getContent()), [])
+        self.assertDictEqual(
+            tree.toJSON(),
+            {}
+        )
+
+    def test_disjoint_dict(self):
+        tree = compare.Tree(dict)
+        tree['one/entry']['leaf'] = 1
+        tree['two/other']['leaf'] = 2
+        self.assertEqual(
+            list(tree.getContent()),
+            [
+                (0, 'key', ('one', 'entry')),
+                (1, 'value', {'leaf': 1}),
+                (0, 'key', ('two', 'other')),
+                (1, 'value', {'leaf': 2})
+            ]
+        )
+        self.assertDictEqual(
+            tree.toJSON(),
+            {
+                'children': [
+                    ('one/entry',
+                     {'value': {'leaf': 1}}
+                     ),
+                    ('two/other',
+                     {'value': {'leaf': 2}}
+                     )
+                ]
+            }
+        )
+        self.assertMultiLineEqual(
+            str(tree),
+            '''\
+one/entry
+    {'leaf': 1}
+two/other
+    {'leaf': 2}\
+'''
+        )
+
+    def test_overlapping_dict(self):
+        tree = compare.Tree(dict)
+        tree['one/entry']['leaf'] = 1
+        tree['one/other']['leaf'] = 2
+        self.assertEqual(
+            list(tree.getContent()),
+            [
+                (0, 'key', ('one',)),
+                (1, 'key', ('entry',)),
+                (2, 'value', {'leaf': 1}),
+                (1, 'key', ('other',)),
+                (2, 'value', {'leaf': 2})
+            ]
+        )
+        self.assertDictEqual(
+            tree.toJSON(),
+            {
+                'children': [
+                    ('one', {
+                        'children': [
+                            ('entry',
+                             {'value': {'leaf': 1}}
+                             ),
+                            ('other',
+                             {'value': {'leaf': 2}}
+                             )
+                        ]
+                    })
+                ]
+            }
+        )
diff --git a/python/compare-locales/compare_locales/tests/test_dtd.py b/python/compare-locales/compare_locales/tests/test_dtd.py
new file mode 100644
index 000000000..87ddcde30
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_dtd.py
@@ -0,0 +1,86 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''Tests for the DTD parser.
+'''
+
+import unittest
+import re
+
+from compare_locales.parser import getParser
+from compare_locales.tests import ParserTestMixin
+
+
+class TestDTD(ParserTestMixin, unittest.TestCase):
+    '''Tests for the DTD Parser.'''
+    filename = 'foo.dtd'
+
+    def test_one_entity(self):
+        self._test('''<!ENTITY foo.label "stuff">''',
+                   (('foo.label', 'stuff'),))
+
+    quoteContent = '''<!ENTITY good.one "one">
+<!ENTITY bad.one "bad " quote">
+<!ENTITY good.two "two">
+<!ENTITY bad.two "bad "quoted" word">
+<!ENTITY good.three "three">
+<!ENTITY good.four "good ' quote">
+<!ENTITY good.five "good 'quoted' word">
+'''
+    quoteRef = (
+        ('good.one', 'one'),
+        ('_junk_\\d_25-56$', '<!ENTITY bad.one "bad " quote">'),
+        ('good.two', 'two'),
+        ('_junk_\\d_82-119$', '<!ENTITY bad.two "bad "quoted" word">'),
+        ('good.three', 'three'),
+        ('good.four', 'good \' quote'),
+        ('good.five', 'good \'quoted\' word'),)
+
+    def test_quotes(self):
+        self._test(self.quoteContent, self.quoteRef)
+
+    def test_apos(self):
+        qr = re.compile('[\'"]', re.M)
+
+        def quot2apos(s):
+            return qr.sub(lambda m: m.group(0) == '"' and "'" or '"', s)
+
+        self._test(quot2apos(self.quoteContent),
+                   map(lambda t: (t[0], quot2apos(t[1])), self.quoteRef))
+
+    def test_parsed_ref(self):
+        self._test('''<!ENTITY % fooDTD SYSTEM "chrome://brand.dtd">
+  %fooDTD;
+''',
+                   (('fooDTD', '"chrome://brand.dtd"'),))
+
+    def test_trailing_comment(self):
+        self._test('''<!ENTITY first "string">
+<!ENTITY second "string">
+<!--
+<!ENTITY commented "out">
+-->
+''',
+                   (('first', 'string'), ('second', 'string')))
+
+    def test_license_header(self):
+        p = getParser('foo.dtd')
+        p.readContents(self.resource('triple-license.dtd'))
+        for e in p:
+            self.assertEqual(e.key, 'foo')
+            self.assertEqual(e.val, 'value')
+        self.assert_('MPL' in p.header)
+        p.readContents('''\
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+   - License, v. 2.0. If a copy of the MPL was not distributed with this file,
+   - You can obtain one at http://mozilla.org/MPL/2.0/.  -->
+<!ENTITY foo "value">
+''')
+        for e in p:
+            self.assertEqual(e.key, 'foo')
+            self.assertEqual(e.val, 'value')
+        self.assert_('MPL' in p.header)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_ini.py b/python/compare-locales/compare_locales/tests/test_ini.py
new file mode 100644
index 000000000..4c8cc03e1
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_ini.py
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.tests import ParserTestMixin
+
+
+mpl2 = '''\
+; This Source Code Form is subject to the terms of the Mozilla Public
+; License, v. 2.0. If a copy of the MPL was not distributed with this file,
+; You can obtain one at http://mozilla.org/MPL/2.0/.
+'''
+
+
+class TestIniParser(ParserTestMixin, unittest.TestCase):
+
+    filename = 'foo.ini'
+
+    def testSimpleHeader(self):
+        self._test('''; This file is in the UTF-8 encoding
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('UTF-8' in self.parser.header)
+
+    def testMPL2_Space_UTF(self):
+        self._test(mpl2 + '''
+; This file is in the UTF-8 encoding
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def testMPL2_Space(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def testMPL2_MultiSpace(self):
+        self._test(mpl2 + '''\
+
+; more comments
+
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def testMPL2_JunkBeforeCategory(self):
+        self._test(mpl2 + '''\
+Junk
+[Strings]
+TitleText=Some Title
+''', (('_junk_\\d+_0-213$', mpl2 + '''\
+Junk
+[Strings]'''), ('TitleText', 'Some Title')))
+        self.assert_('MPL' not in self.parser.header)
+
+    def test_TrailingComment(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+;Stray trailing comment
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def test_SpacedTrailingComments(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+
+;Stray trailing comment
+;Second stray comment
+
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def test_TrailingCommentsAndJunk(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+
+;Stray trailing comment
+Junk
+;Second stray comment
+
+''', (('TitleText', 'Some Title'), ('_junk_\\d+_231-284$', '''\
+
+;Stray trailing comment
+Junk
+;Second stray comment
+
+''')))
+        self.assert_('MPL' in self.parser.header)
+
+    def test_JunkInbetweenEntries(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+
+Junk
+
+Good=other string
+''', (('TitleText', 'Some Title'), ('_junk_\\d+_231-236$', '''\
+
+Junk'''), ('Good', 'other string')))
+        self.assert_('MPL' in self.parser.header)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_merge.py b/python/compare-locales/compare_locales/tests/test_merge.py
new file mode 100644
index 000000000..c006edbb5
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_merge.py
@@ -0,0 +1,265 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+import os
+from tempfile import mkdtemp
+import shutil
+
+from compare_locales.parser import getParser
+from compare_locales.paths import File
+from compare_locales.compare import ContentComparer
+
+
+class ContentMixin(object):
+    maxDiff = None  # don't truncate diffs, we compare big dictionaries
+    extension = None  # file suffix incl. dot, set by the subclasses
+
+    def reference(self, content):
+        self.ref = os.path.join(self.tmp, "en-reference" + self.extension)
+        open(self.ref, "w").write(content)  # NOTE: never closed explicitly
+
+    def localized(self, content):
+        self.l10n = os.path.join(self.tmp, "l10n" + self.extension)
+        open(self.l10n, "w").write(content)  # NOTE: never closed explicitly
+
+
+class TestProperties(unittest.TestCase, ContentMixin):
+    extension = '.properties'
+
+    def setUp(self):
+        self.tmp = mkdtemp()
+        os.mkdir(os.path.join(self.tmp, "merge"))
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp)
+        del self.tmp
+
+    def testGood(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        self.reference("""foo = fooVal
+bar = barVal
+eff = effVal""")
+        self.localized("""foo = lFoo
+bar = lBar
+eff = lEff
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.properties", ""),
+                   File(self.l10n, "l10n.properties", ""))
+        self.assertDictEqual(
+            cc.observer.toJSON(),
+            {'summary':
+                {None: {
+                    'changed': 3
+                }},
+             'details': {}
+             }
+        )
+        self.assert_(not os.path.exists(os.path.join(cc.merge_stage,
+                                                     'l10n.properties')))
+
+    def testMissing(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        self.reference("""foo = fooVal
+bar = barVal
+eff = effVal""")
+        self.localized("""bar = lBar
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.properties", ""),
+                   File(self.l10n, "l10n.properties", ""))
+        self.assertDictEqual(
+            cc.observer.toJSON(),
+            {'summary':
+                {None: {
+                    'changed': 1, 'missing': 2
+                }},
+             'details': {
+                 'children': [
+                     ('l10n.properties',
+                         {'value': {'missingEntity': [u'eff', u'foo']}}
+                      )
+                 ]}
+             }
+        )
+        mergefile = os.path.join(self.tmp, "merge", "l10n.properties")
+        self.assertTrue(os.path.isfile(mergefile))
+        p = getParser(mergefile)
+        p.readFile(mergefile)
+        [m, n] = p.parse()
+        self.assertEqual(map(lambda e: e.key,  m), ["bar", "eff", "foo"])
+
+    def testError(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        self.reference("""foo = fooVal
+bar = %d barVal
+eff = effVal""")
+        self.localized("""bar = %S lBar
+eff = leffVal
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.properties", ""),
+                   File(self.l10n, "l10n.properties", ""))
+        self.assertDictEqual(
+            cc.observer.toJSON(),
+            {'summary':
+                {None: {
+                    'changed': 2, 'errors': 1, 'missing': 1
+                }},
+             'details': {
+                 'children': [
+                     ('l10n.properties',
+                         {'value': {
+                          'error': [u'argument 1 `S` should be `d` '
+                                    u'at line 1, column 6 for bar'],
+                          'missingEntity': [u'foo']}}
+                      )
+                 ]}
+             }
+        )
+        mergefile = os.path.join(self.tmp, "merge", "l10n.properties")
+        self.assertTrue(os.path.isfile(mergefile))
+        p = getParser(mergefile)
+        p.readFile(mergefile)
+        [m, n] = p.parse()
+        self.assertEqual([e.key for e in m], ["eff", "foo", "bar"])
+        self.assertEqual(m[n['bar']].val, '%d barVal')
+
+    def testObsolete(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        self.reference("""foo = fooVal
+eff = effVal""")
+        self.localized("""foo = fooVal
+other = obsolete
+eff = leffVal
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.properties", ""),
+                   File(self.l10n, "l10n.properties", ""))
+        self.assertDictEqual(
+            cc.observer.toJSON(),
+            {'summary':
+                {None: {
+                    'changed': 1, 'obsolete': 1, 'unchanged': 1
+                }},
+             'details': {
+                 'children': [
+                     ('l10n.properties',
+                         {'value': {'obsoleteEntity': [u'other']}})]},
+             }
+        )
+
+
+class TestDTD(unittest.TestCase, ContentMixin):
+    extension = '.dtd'
+
+    def setUp(self):
+        self.tmp = mkdtemp()
+        os.mkdir(os.path.join(self.tmp, "merge"))
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp)
+        del self.tmp
+
+    def testGood(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        self.reference("""<!ENTITY foo 'fooVal'>
+<!ENTITY bar 'barVal'>
+<!ENTITY eff 'effVal'>""")
+        self.localized("""<!ENTITY foo 'lFoo'>
+<!ENTITY bar 'lBar'>
+<!ENTITY eff 'lEff'>
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.dtd", ""),
+                   File(self.l10n, "l10n.dtd", ""))
+        self.assertDictEqual(
+            cc.observer.toJSON(),
+            {'summary':
+                {None: {
+                    'changed': 3
+                }},
+             'details': {}
+             }
+        )
+        self.assert_(
+            not os.path.exists(os.path.join(cc.merge_stage, 'l10n.dtd')))
+
+    def testMissing(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        self.reference("""<!ENTITY foo 'fooVal'>
+<!ENTITY bar 'barVal'>
+<!ENTITY eff 'effVal'>""")
+        self.localized("""<!ENTITY bar 'lBar'>
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.dtd", ""),
+                   File(self.l10n, "l10n.dtd", ""))
+        self.assertDictEqual(
+            cc.observer.toJSON(),
+            {'summary':
+                {None: {
+                    'changed': 1, 'missing': 2
+                }},
+             'details': {
+                 'children': [
+                     ('l10n.dtd',
+                         {'value': {'missingEntity': [u'eff', u'foo']}}
+                      )
+                 ]}
+             }
+        )
+        mergefile = os.path.join(self.tmp, "merge", "l10n.dtd")
+        self.assertTrue(os.path.isfile(mergefile))
+        p = getParser(mergefile)
+        p.readFile(mergefile)
+        [m, n] = p.parse()
+        self.assertEqual(map(lambda e: e.key,  m), ["bar", "eff", "foo"])
+
+    def testJunk(self):
+        self.assertTrue(os.path.isdir(self.tmp))
+        self.reference("""<!ENTITY foo 'fooVal'>
+<!ENTITY bar 'barVal'>
+<!ENTITY eff 'effVal'>""")
+        self.localized("""<!ENTITY foo 'fooVal'>
+<!ENTY bar 'gimmick'>
+<!ENTITY eff 'effVal'>
+""")
+        cc = ContentComparer()
+        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
+        cc.compare(File(self.ref, "en-reference.dtd", ""),
+                   File(self.l10n, "l10n.dtd", ""))
+        self.assertDictEqual(
+            cc.observer.toJSON(),
+            {'summary':
+                {None: {
+                    'errors': 1, 'missing': 1, 'unchanged': 2
+                }},
+             'details': {
+                 'children': [
+                     ('l10n.dtd',
+                         {'value': {
+                             'error': [u'Unparsed content "<!ENTY bar '
+                                       u'\'gimmick\'>" at 23-44'],
+                             'missingEntity': [u'bar']}}
+                      )
+                 ]}
+             }
+        )
+        mergefile = os.path.join(self.tmp, "merge", "l10n.dtd")
+        self.assertTrue(os.path.isfile(mergefile))
+        p = getParser(mergefile)
+        p.readFile(mergefile)
+        [m, n] = p.parse()
+        self.assertEqual(map(lambda e: e.key,  m), ["foo", "eff", "bar"])
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_properties.py b/python/compare-locales/compare_locales/tests/test_properties.py
new file mode 100644
index 000000000..331a1a57c
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_properties.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.tests import ParserTestMixin
+
+
+class TestPropertiesParser(ParserTestMixin, unittest.TestCase):
+
+    filename = 'foo.properties'
+
+    def testBackslashes(self):
+        self._test(r'''one_line = This is one line
+two_line = This is the first \
+of two lines
+one_line_trailing = This line ends in \\
+and has junk
+two_lines_triple = This line is one of two and ends in \\\
+and still has another line coming
+''', (
+            ('one_line', 'This is one line'),
+            ('two_line', u'This is the first of two lines'),
+            ('one_line_trailing', u'This line ends in \\'),
+            ('_junk_\\d+_113-126$', 'and has junk\n'),
+            ('two_lines_triple', 'This line is one of two and ends in \\'
+             'and still has another line coming')))
+
+    def testProperties(self):
+        # port of netwerk/test/PropertiesTest.cpp
+        self.parser.readContents(self.resource('test.properties'))
+        ref = ['1', '2', '3', '4', '5', '6', '7', '8',
+               'this is the first part of a continued line '
+               'and here is the 2nd part']
+        i = iter(self.parser)
+        for r, e in zip(ref, i):
+            self.assertEqual(e.val, r)
+
+    def test_bug121341(self):
+        # port of xpcom/tests/unit/test_bug121341.js
+        self.parser.readContents(self.resource('bug121341.properties'))
+        ref = ['abc', 'xy', u"\u1234\t\r\n\u00AB\u0001\n",
+               "this is multiline property",
+               "this is another multiline property", u"test\u0036",
+               "yet another multiline propery", u"\ttest5\u0020", " test6\t",
+               u"c\uCDEFd", u"\uABCD"]
+        i = iter(self.parser)
+        for r, e in zip(ref, i):
+            self.assertEqual(e.val, r)
+
+    def test_comment_in_multi(self):
+        self._test(r'''bar=one line with a \
+# part that looks like a comment \
+and an end''', (('bar', 'one line with a # part that looks like a comment '
+                'and an end'),))
+
+    def test_license_header(self):
+        self._test('''\
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+foo=value
+''', (('foo', 'value'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def test_escapes(self):
+        self.parser.readContents(r'''
+# unicode escapes
+zero = some \unicode
+one = \u0
+two = \u41
+three = \u042
+four = \u0043
+five = \u0044a
+six = \a
+seven = \n\r\t\\
+''')
+        ref = ['some unicode', chr(0), 'A', 'B', 'C', 'Da', 'a', '\n\r\t\\']
+        for r, e in zip(ref, self.parser):
+            self.assertEqual(e.val, r)
+
+    def test_trailing_comment(self):
+        self._test('''first = string
+second = string
+
+#
+#commented out
+''', (('first', 'string'), ('second', 'string')))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_util.py b/python/compare-locales/compare_locales/tests/test_util.py
new file mode 100644
index 000000000..fd2d2c92b
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_util.py
@@ -0,0 +1,29 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import util
+
+
+class ParseLocalesTest(unittest.TestCase):
+    def test_empty(self):
+        self.assertEquals(util.parseLocales(''), [])
+
+    def test_all(self):
+        self.assertEquals(util.parseLocales('''af
+de'''), ['af', 'de'])
+
+    def test_shipped(self):
+        self.assertEquals(util.parseLocales('''af
+ja win mac
+de'''), ['af', 'de', 'ja'])
+
+    def test_sparse(self):
+        self.assertEquals(util.parseLocales('''
+af
+
+de
+
+'''), ['af', 'de'])
diff --git a/python/compare-locales/compare_locales/tests/test_webapps.py b/python/compare-locales/compare_locales/tests/test_webapps.py
new file mode 100644
index 000000000..2f1223649
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_webapps.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import webapps
+
+
+class TestFileComparison(unittest.TestCase):
+
+    def mock_FileComparison(self, mock_listdir):
+        class Target(webapps.FileComparison):
+            def _listdir(self):
+                return mock_listdir()
+        return Target('.', 'en-US')
+
+    def test_just_reference(self):
+        def _listdir():
+            return ['my_app.en-US.properties']
+        filecomp = self.mock_FileComparison(_listdir)
+        filecomp.files()
+        self.assertEqual(filecomp.locales(), [])
+        self.assertEqual(filecomp._reference.keys(), ['my_app'])
+        file_ = filecomp._reference['my_app']
+        self.assertEqual(file_.file, 'locales/my_app.en-US.properties')
+
+    def test_just_locales(self):
+        def _listdir():
+            return ['my_app.ar.properties',
+                    'my_app.sr-Latn.properties',
+                    'my_app.sv-SE.properties',
+                    'my_app.po_SI.properties']
+        filecomp = self.mock_FileComparison(_listdir)
+        filecomp.files()
+        self.assertEqual(filecomp.locales(),
+                         ['ar', 'sr-Latn', 'sv-SE'])
+        self.assertEqual(filecomp._files['ar'].keys(), ['my_app'])
+        file_ = filecomp._files['ar']['my_app']
+        self.assertEqual(file_.file, 'locales/my_app.ar.properties')
diff --git a/python/compare-locales/compare_locales/util.py b/python/compare-locales/compare_locales/util.py
new file mode 100644
index 000000000..71eadd874
--- /dev/null
+++ b/python/compare-locales/compare_locales/util.py
@@ -0,0 +1,11 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This file is shared between compare-locales and locale-inspector
+# test_util is in compare-locales only, for the sake of easy
+# development.
+
+
+def parseLocales(content):
+    return sorted(l.split()[0] for l in content.splitlines() if l)
diff --git a/python/compare-locales/compare_locales/webapps.py b/python/compare-locales/compare_locales/webapps.py
new file mode 100644
index 000000000..42f5b5657
--- /dev/null
+++ b/python/compare-locales/compare_locales/webapps.py
@@ -0,0 +1,235 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''gaia-style web apps support
+
+This variant supports manifest.webapp localization as well as
+.properties files with a naming scheme of locales/foo.*.properties.
+'''
+
+from collections import defaultdict
+import json
+import os
+import os.path
+import re
+
+from compare_locales.paths import File, EnumerateDir
+from compare_locales.compare import AddRemove, ContentComparer
+
+
+class WebAppCompare(object):
+    '''For a given directory, analyze
+    /manifest.webapp
+    /locales/*.*.properties
+
+    Deduce the present locale codes.
+    '''
+    ignore_dirs = EnumerateDir.ignore_dirs
+    reference_locale = 'en-US'
+
+    def __init__(self, basedir):
+        '''Constructor
+        :param basedir: Directory of the web app to inspect
+        '''
+        self.basedir = basedir
+        self.manifest = Manifest(basedir, self.reference_locale)
+        self.files = FileComparison(basedir, self.reference_locale)
+        self.watcher = None
+
+    def compare(self, locales):
+        '''Compare the manifest.webapp and the locales/*.*.properties
+        '''
+        if not locales:
+            locales = self.locales()
+        self.manifest.compare(locales)
+        self.files.compare(locales)
+
+    def setWatcher(self, watcher):
+        self.watcher = watcher
+        self.manifest.watcher = watcher
+        self.files.watcher = watcher
+
+    def locales(self):
+        '''Inspect files on disk to find present languages.
+        :rtype: List of locales, sorted, including reference.
+        '''
+        locales = set(self.manifest.strings.keys())
+        locales.update(self.files.locales())
+        locales = list(sorted(locales))
+        return locales
+
+
+class Manifest(object):
+    '''Class that helps with parsing and inspection of manifest.webapp.
+    '''
+
+    def __init__(self, basedir, reference_locale):
+        self.file = File(os.path.join(basedir, 'manifest.webapp'),
+                         'manifest.webapp')
+        self.reference_locale = reference_locale
+        self._strings = None
+        self.watcher = None
+
+    @property
+    def strings(self):
+        if self._strings is None:
+            self._strings = self.load_and_parse()
+        return self._strings
+
+    def load_and_parse(self):
+        try:
+            manifest = json.load(open(self.file.fullpath))
+        except (ValueError, IOError), e:
+            if self.watcher:
+                self.watcher.notify('error', self.file, str(e))
+            return False
+        return self.extract_manifest_strings(manifest)
+
+    def extract_manifest_strings(self, manifest_fragment):
+        '''Extract localizable strings from a manifest dict.
+        This method is recursive, and returns a two-level dict,
+        first level being locale codes, second level being generated
+        key and localized value. Keys are generated by concatenating
+        each level in the json with a ".".
+        '''
+        rv = defaultdict(dict)
+        localizable = manifest_fragment.pop('locales', {})
+        if localizable:
+            for locale, keyvalue in localizable.iteritems():
+                for key, value in keyvalue.iteritems():
+                    key = '.'.join(['locales', 'AB_CD', key])
+                    rv[locale][key] = value
+        for key, sub_manifest in manifest_fragment.iteritems():
+            if not isinstance(sub_manifest, dict):
+                continue
+            subdict = self.extract_manifest_strings(sub_manifest)
+            if subdict:
+                for locale, keyvalue in subdict:
+                    rv[locale].update((key + '.' + subkey, value)
+                                      for subkey, value
+                                      in keyvalue.iteritems())
+        return rv
+
+    def compare(self, locales):
+        strings = self.strings
+        if not strings:
+            return
+        # create a copy so that we can mock around with it
+        strings = strings.copy()
+        reference = strings.pop(self.reference_locale)
+        for locale in locales:
+            if locale == self.reference_locale:
+                continue
+            self.compare_strings(reference,
+                                 strings.get(locale, {}),
+                                 locale)
+
+    def compare_strings(self, reference, l10n, locale):
+        add_remove = AddRemove()
+        add_remove.set_left(sorted(reference.keys()))
+        add_remove.set_right(sorted(l10n.keys()))
+        missing = obsolete = changed = unchanged = 0
+        for op, item_or_pair in add_remove:
+            if op == 'equal':
+                if reference[item_or_pair[0]] == l10n[item_or_pair[1]]:
+                    unchanged += 1
+                else:
+                    changed += 1
+            else:
+                key = item_or_pair.replace('.AB_CD.',
+                                           '.%s.' % locale)
+                if op == 'add':
+                    # obsolete entry
+                    obsolete += 1
+                    self.watcher.notify('obsoleteEntity', self.file, key)
+                else:
+                    # missing entry
+                    missing += 1
+                    self.watcher.notify('missingEntity', self.file, key)
+
+
+class FileComparison(object):
+    '''Compare the locales/*.*.properties files inside a webapp.
+    '''
+    prop = re.compile('(?P<base>.*)\\.'
+                      '(?P<locale>[a-zA-Z]+(?:-[a-zA-Z]+)*)'
+                      '\\.properties$')
+
+    def __init__(self, basedir, reference_locale):
+        self.basedir = basedir
+        self.reference_locale = reference_locale
+        self.watcher = None
+        self._reference = self._files = None
+
+    def locales(self):
+        '''Get the locales present in the webapp
+        '''
+        self.files()
+        locales = self._files.keys()
+        locales.sort()
+        return locales
+
+    def compare(self, locales):
+        self.files()
+        for locale in locales:
+            l10n = self._files[locale]
+            filecmp = AddRemove()
+            filecmp.set_left(sorted(self._reference.keys()))
+            filecmp.set_right(sorted(l10n.keys()))
+            for op, item_or_pair in filecmp:
+                if op == 'equal':
+                    self.watcher.compare(self._reference[item_or_pair[0]],
+                                         l10n[item_or_pair[1]])
+                elif op == 'add':
+                    # obsolete file
+                    self.watcher.remove(l10n[item_or_pair])
+                else:
+                    # missing file
+                    _path = '.'.join([item_or_pair, locale, 'properties'])
+                    missingFile = File(
+                        os.path.join(self.basedir, 'locales', _path),
+                        'locales/' + _path)
+                    self.watcher.add(self._reference[item_or_pair],
+                                     missingFile)
+
+    def files(self):
+        '''Read the list of locales from disk.
+        '''
+        if self._reference:
+            return
+        self._reference = {}
+        self._files = defaultdict(dict)
+        path_list = self._listdir()
+        for path in path_list:
+            match = self.prop.match(path)
+            if match is None:
+                continue
+            locale = match.group('locale')
+            if locale == self.reference_locale:
+                target = self._reference
+            else:
+                target = self._files[locale]
+            fullpath = os.path.join(self.basedir, 'locales', path)
+            target[match.group('base')] = File(fullpath, 'locales/' + path)
+
+    def _listdir(self):
+        'Monkey-patch this for testing.'
+        return os.listdir(os.path.join(self.basedir, 'locales'))
+
+
+def compare_web_app(basedir, locales, other_observer=None):
+    '''Compare gaia-style web app.
+
+    Optional arguments are:
+    - other_observer. A object implementing
+        notify(category, _file, data)
+      The return values of that callback are ignored.
+    '''
+    comparer = ContentComparer()
+    if other_observer is not None:
+        comparer.add_observer(other_observer)
+    webapp_comp = WebAppCompare(basedir)
+    webapp_comp.setWatcher(comparer)
+    webapp_comp.compare(locales)
+    return comparer.observer
diff --git a/python/compare-locales/docs/glossary.rst b/python/compare-locales/docs/glossary.rst
new file mode 100644
index 000000000..e89839b16
--- /dev/null
+++ b/python/compare-locales/docs/glossary.rst
@@ -0,0 +1,26 @@
+========
+Glossary
+========
+
+.. glossary::
+    :sorted:
+
+    Localization
+        The process of creating content in a native language, including
+        translation, but also customizations like Search.
+
+    Localizability
+        Enabling a piece of software to be localized. This is mostly
+        externalizing English strings, and writing build support to
+        pick up localized search engines etc.
+
+    L10n
+        *Numeronym* for Localization, *L*, 10 chars, *n*
+
+    L12y
+        Numeronym for Localizability
+
+    l10n-merge
+        nick-name for the process of merging ``en-US`` and a particular
+        localization into one joint artifact without any missing strings, and
+        without technical errors, as far as possible.
diff --git a/python/compare-locales/docs/index.rst b/python/compare-locales/docs/index.rst
new file mode 100644
index 000000000..925ca0f88
--- /dev/null
+++ b/python/compare-locales/docs/index.rst
@@ -0,0 +1,191 @@
+============
+Localization
+============
+
+.. toctree::
+   :maxdepth: 1
+
+   glossary
+
+The documentation here is targeted at developers, writing localizable code
+for Firefox and Firefox for Android, as well as Thunderbird and SeaMonkey.
+
+If you haven't dealt with localization in gecko code before, it's a good
+idea to check the :doc:`./glossary` for what localization is, and which terms
+we use for what.
+
+Exposing strings
+----------------
+
+Localizers only handle a few file formats in well-known locations in the
+source tree.
+
+The locations are in directories like
+
+    :file:`browser/`\ ``locales/en-US/``\ :file:`subdir/file.ext`
+
+The first thing to note is that only files beneath :file:`locales/en-US` are
+exposed to localizers. The second thing to note is that only a few directories
+are exposed. Which directories are exposed is defined in files called
+``l10n.ini``, which are at a
+`few places <https://dxr.mozilla.org/mozilla-central/search?q=path%3Al10n.ini&redirect=true>`_
+in the source code.
+
+An example looks like this
+
+.. code-block:: ini
+
+    [general]
+    depth = ../..
+
+    [compare]
+    dirs = browser
+        browser/branding/official
+
+    [includes]
+    toolkit = toolkit/locales/l10n.ini
+
+This tells the l10n infrastructure three things: Resolve the paths against the
+directory two levels up, include files in :file:`browser/locales/en-US` and
+:file:`browser/branding/official/locales/en-US`, and load more data from
+:file:`toolkit/locales/l10n.ini`.
+
+For projects like Thunderbird and SeaMonkey in ``comm-central``, additional
+data needs to be provided when including an ``l10n.ini`` from a different
+repository:
+
+.. code-block:: ini
+
+    [include_toolkit]
+    type = hg
+    mozilla = mozilla-central
+    repo = http://hg.mozilla.org/
+    l10n.ini = toolkit/locales/l10n.ini
+
+This tells the l10n pieces where to find the repository, and where inside
+that repository the ``l10n.ini`` file is. This is needed because for local
+builds, :file:`mail/locales/l10n.ini` references
+:file:`mozilla/toolkit/locales/l10n.ini`, which is where the comm-central
+build setup expects toolkit to be.
+
+Now that the directories exposed to l10n are known, we can talk about the
+supported file formats.
+
+File formats
+------------
+
+This is just a quick overview; please check the
+`XUL Tutorial <https://developer.mozilla.org/docs/Mozilla/Tech/XUL/Tutorial/Localization>`_
+for an in-depth tour.
+
+The following file formats are known to the l10n tool chains:
+
+DTD
+    Used in XUL and XHTML. Also for Android native strings.
+Properties
+    Used from JavaScript and C++. When used from js, also comes with
+    `plural support <https://developer.mozilla.org/docs/Mozilla/Localization/Localization_and_Plurals>`_.
+ini
+    Used by the crashreporter and updater, avoid if possible.
+foo.defines
+    Used during builds, for example to create :file:`install.rdf` for
+    language packs.
+
+Adding new formats involves changing various different tools, and is strongly
+discouraged.
+
+Exceptions
+----------
+Generally, anything that exists in ``en-US`` needs a one-to-one mapping in
+all localizations. There are a few cases where that's not wanted, notably
+around search settings and spell-checking dictionaries.
+
+To enable tools to adjust to those exceptions, there's a python-coded
+:py:mod:`filter.py`, implementing :py:func:`test`, with the following
+signature
+
+.. code-block:: python
+
+    def test(mod, path, entity = None):
+        if does_not_matter:
+            return "ignore"
+        if show_but_do_not_merge:
+            return "report"
+        # default behavior, localizer or build need to do something
+        return "error"
+
+For any missing file, this function is called with ``mod`` being
+the *module*, and ``path`` being the relative path inside
+:file:`locales/en-US`. The module is the top-level dir as referenced in
+:file:`l10n.ini`.
+
+For missing strings, the :py:data:`entity` parameter is the key of the string
+in the en-US file.
+
+l10n-merge
+----------
+
+Gecko doesn't support fallback from a localization to ``en-US`` at runtime.
+Thus, the build needs to ensure that the localization as it's built into
+the package has all required strings, and that the strings don't contain
+errors. To ensure that, we're *merging* the localization and ``en-US``
+at build time, nick-named :term:`l10n-merge`.
+
+The process is usually triggered via
+
+.. code-block:: bash
+
+    $obj-dir/browser/locales> make merge-de LOCALE_MERGEDIR=$PWD/merge-de
+
+It creates another directory in the object dir, :file:`merge-ab-CD`, in
+which the modified files are stored. The actual repackaging process looks for
+each file in the merge dir first, then in the localization itself, and then
+in ``en-US``. Thus, for the ``de`` localization of
+:file:`browser/locales/en-US/chrome/browser/browser.dtd`, it checks
+
+1. :file:`$objdir/browser/locales/merge-de/browser/chrome/browser/browser.dtd`
+2. :file:`$(LOCALE_BASEDIR)/de/browser/chrome/browser/browser.dtd`
+3. :file:`browser/locales/en-US/chrome/browser/browser.dtd`
+
+and will include the first of those files it finds.
+
+l10n-merge modifies a file if it supports the particular file type, and there
+are missing strings which are not filtered out, or if an existing string
+shows an error. See the Checks section below for details.
+
+Checks
+------
+
+As part of the build and other localization tool chains, we run a variety
+of source-based checks. Think of them as linters.
+
+The suite of checks is usually determined by file type, i.e., there's a
+suite of checks for DTD files and one for properties files, etc. The
+Android-specific checks are an exception.
+
+Android
+^^^^^^^
+
+For Android, we need to localize :file:`strings.xml`. We're doing so via DTD
+files, which is mostly OK. But the strings inside the XML file have to
+satisfy additional constraints about quotes etc, that are not part of XML.
+There's probably some historic background on why things are the way they are.
+
+The Android-specific checks are enabled for DTD files that are in
+:file:`mobile/android/base/locales/en-US/`.
+
+Localizations
+-------------
+
+Now that we talked in-depth about how to expose content to localizers,
+where are the localizations?
+
+We host a mercurial repository per locale and per branch. Most of our
+localizations only work starting with aurora, so the bulk of the localizations
+is found on https://hg.mozilla.org/releases/l10n/mozilla-aurora/. We have
+several localizations continuously working with mozilla-central, those
+repositories are on https://hg.mozilla.org/l10n-central/.
+
+You can search inside our localized files on
+`Transvision <https://transvision.mozfr.org/>`_ and
+http://dxr.mozilla.org/l10n-mozilla-aurora/.
diff --git a/python/compare-locales/mach_commands.py b/python/compare-locales/mach_commands.py
new file mode 100644
index 000000000..7be6a50e7
--- /dev/null
+++ b/python/compare-locales/mach_commands.py
@@ -0,0 +1,81 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import, print_function, unicode_literals
+
+from mach.decorators import (
+    CommandArgument,
+    CommandProvider,
+    Command,
+)
+
+from mozbuild.base import (
+    MachCommandBase,
+)
+
+import mozpack.path as mozpath
+
+
+MERGE_HELP = '''Directory to merge to. Will be removed to before running
+the comparison. Default: $(OBJDIR)/($MOZ_BUILD_APP)/locales/merge-$(AB_CD)
+'''.lstrip()
+
+
+@CommandProvider
+class CompareLocales(MachCommandBase):
+    """Run compare-locales."""
+
+    @Command('compare-locales', category='testing',
+             description='Run source checks on a localization.')
+    @CommandArgument('--l10n-ini',
+                     help='l10n.ini describing the app. ' +
+                     'Default: $(MOZ_BUILD_APP)/locales/l10n.ini')
+    @CommandArgument('--l10n-base',
+                     help='Directory with the localizations. ' +
+                     'Default: $(L10NBASEDIR)')
+    @CommandArgument('--merge-dir',
+                     help=MERGE_HELP)
+    @CommandArgument('locales', nargs='+', metavar='ab_CD',
+                     help='Locale codes to compare')
+    def compare(self, l10n_ini=None, l10n_base=None, merge_dir=None,
+                locales=None):
+        from compare_locales.paths import EnumerateApp
+        from compare_locales.compare import compareApp
+
+        # check if we're configured and use defaults from there
+        # otherwise, error early
+        try:
+            self.substs  # explicitly check
+            if not l10n_ini:
+                l10n_ini = mozpath.join(
+                    self.topsrcdir,
+                    self.substs['MOZ_BUILD_APP'],
+                    'locales', 'l10n.ini'
+                )
+            if not l10n_base:
+                l10n_base = mozpath.join(
+                    self.topsrcdir,
+                    self.substs['L10NBASEDIR']
+                )
+        except Exception:
+            if not l10n_ini or not l10n_base:
+                print('Specify --l10n-ini and --l10n-base or run configure.')
+                return 1
+
+        if not merge_dir:
+            try:
+                # self.substs is raising an Exception if we're not configured
+                # don't merge if we're not
+                merge_dir = mozpath.join(
+                    self.topobjdir,
+                    self.substs['MOZ_BUILD_APP'],
+                    'locales', 'merge-dir-{ab_CD}'
+                )
+            except Exception:
+                pass
+
+        app = EnumerateApp(l10n_ini, l10n_base, locales)
+        observer = compareApp(app, merge_stage=merge_dir,
+                              clobber=True)
+        print(observer.serialize())
diff --git a/python/compare-locales/moz.build b/python/compare-locales/moz.build
new file mode 100644
index 000000000..f772ab620
--- /dev/null
+++ b/python/compare-locales/moz.build
@@ -0,0 +1,16 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+with Files('compare_locales/**'):  # bug component for the package sources
+    BUG_COMPONENT = ('Localization Infrastructure and Tools', 'compare-locales')
+with Files('docs/**'):  # the docs are filed under a different component
+    BUG_COMPONENT = ('Mozilla Localizations', 'Documentation')
+
+# SPHINX_PYTHON_PACKAGE_DIRS += [
+#     'compare_locales',
+# ]
+
+SPHINX_TREES['.'] = 'docs'