From 5f8de423f190bbb79a62f804151bc24824fa32d8 Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Fri, 2 Feb 2018 04:16:08 -0500 Subject: Add m-esr52 at 52.6.0 --- python/compare-locales/compare_locales/__init__.py | 1 + python/compare-locales/compare_locales/checks.py | 438 ++++++++++++++ python/compare-locales/compare_locales/commands.py | 154 +++++ python/compare-locales/compare_locales/compare.py | 638 +++++++++++++++++++++ python/compare-locales/compare_locales/parser.py | 521 +++++++++++++++++ python/compare-locales/compare_locales/paths.py | 398 +++++++++++++ .../compare_locales/tests/__init__.py | 49 ++ .../tests/data/bug121341.properties | 68 +++ .../compare_locales/tests/data/test.properties | 14 + .../compare_locales/tests/data/triple-license.dtd | 38 ++ .../compare_locales/tests/test_checks.py | 403 +++++++++++++ .../compare_locales/tests/test_compare.py | 90 +++ .../compare_locales/tests/test_dtd.py | 86 +++ .../compare_locales/tests/test_ini.py | 115 ++++ .../compare_locales/tests/test_merge.py | 265 +++++++++ .../compare_locales/tests/test_properties.py | 95 +++ .../compare_locales/tests/test_util.py | 29 + .../compare_locales/tests/test_webapps.py | 41 ++ python/compare-locales/compare_locales/util.py | 11 + python/compare-locales/compare_locales/webapps.py | 235 ++++++++ 20 files changed, 3689 insertions(+) create mode 100644 python/compare-locales/compare_locales/__init__.py create mode 100644 python/compare-locales/compare_locales/checks.py create mode 100644 python/compare-locales/compare_locales/commands.py create mode 100644 python/compare-locales/compare_locales/compare.py create mode 100644 python/compare-locales/compare_locales/parser.py create mode 100644 python/compare-locales/compare_locales/paths.py create mode 100644 python/compare-locales/compare_locales/tests/__init__.py create mode 100644 python/compare-locales/compare_locales/tests/data/bug121341.properties create mode 100644 
python/compare-locales/compare_locales/tests/data/test.properties create mode 100644 python/compare-locales/compare_locales/tests/data/triple-license.dtd create mode 100644 python/compare-locales/compare_locales/tests/test_checks.py create mode 100644 python/compare-locales/compare_locales/tests/test_compare.py create mode 100644 python/compare-locales/compare_locales/tests/test_dtd.py create mode 100644 python/compare-locales/compare_locales/tests/test_ini.py create mode 100644 python/compare-locales/compare_locales/tests/test_merge.py create mode 100644 python/compare-locales/compare_locales/tests/test_properties.py create mode 100644 python/compare-locales/compare_locales/tests/test_util.py create mode 100644 python/compare-locales/compare_locales/tests/test_webapps.py create mode 100644 python/compare-locales/compare_locales/util.py create mode 100644 python/compare-locales/compare_locales/webapps.py (limited to 'python/compare-locales/compare_locales') diff --git a/python/compare-locales/compare_locales/__init__.py b/python/compare-locales/compare_locales/__init__.py new file mode 100644 index 000000000..bad265e4f --- /dev/null +++ b/python/compare-locales/compare_locales/__init__.py @@ -0,0 +1 @@ +version = "1.1" diff --git a/python/compare-locales/compare_locales/checks.py b/python/compare-locales/compare_locales/checks.py new file mode 100644 index 000000000..ee3bef03d --- /dev/null +++ b/python/compare-locales/compare_locales/checks.py @@ -0,0 +1,438 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import re +from difflib import SequenceMatcher +from xml import sax +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +from compare_locales.parser import DTDParser, PropertiesParser + + +class Checker(object): + '''Abstract class to implement checks per file type. + ''' + pattern = None + + @classmethod + def use(cls, file): + return cls.pattern.match(file.file) + + def check(self, refEnt, l10nEnt): + '''Given the reference and localized Entities, performs checks. + + This is a generator yielding tuples of + - "warning" or "error", depending on what should be reported, + - tuple of line, column info for the error within the string + - description string to be shown in the report + ''' + if True: + raise NotImplementedError("Need to subclass") + yield ("error", (0, 0), "This is an example error", "example") + + +class PrintfException(Exception): + def __init__(self, msg, pos): + self.pos = pos + self.msg = msg + + +class PropertiesChecker(Checker): + '''Tests to run on .properties files. + ''' + pattern = re.compile('.*\.properties$') + printf = re.compile(r'%(?P%|' + r'(?:(?P[1-9][0-9]*)\$)?' + r'(?P\*|[0-9]+)?' + r'(?P\.(?:\*|[0-9]+)?)?' + r'(?P[duxXosScpfg]))?') + + def check(self, refEnt, l10nEnt): + '''Test for the different variable formats. + ''' + refValue, l10nValue = refEnt.val, l10nEnt.val + refSpecs = None + # check for PluralForm.jsm stuff, should have the docs in the + # comment + if 'Localization_and_Plurals' in refEnt.pre_comment: + # For plurals, common variable pattern is #1. Try that. 
+ pats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)', + refValue)) + if len(pats) == 0: + return + lpats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)', + l10nValue)) + if pats - lpats: + yield ('warning', 0, 'not all variables used in l10n', + 'plural') + return + if lpats - pats: + yield ('error', 0, 'unreplaced variables in l10n', + 'plural') + return + return + # check for lost escapes + raw_val = l10nEnt.raw_val + for m in PropertiesParser.escape.finditer(raw_val): + if m.group('single') and \ + m.group('single') not in PropertiesParser.known_escapes: + yield ('warning', m.start(), + 'unknown escape sequence, \\' + m.group('single'), + 'escape') + try: + refSpecs = self.getPrintfSpecs(refValue) + except PrintfException: + refSpecs = [] + if refSpecs: + for t in self.checkPrintf(refSpecs, l10nValue): + yield t + return + + def checkPrintf(self, refSpecs, l10nValue): + try: + l10nSpecs = self.getPrintfSpecs(l10nValue) + except PrintfException, e: + yield ('error', e.pos, e.msg, 'printf') + return + if refSpecs != l10nSpecs: + sm = SequenceMatcher() + sm.set_seqs(refSpecs, l10nSpecs) + msgs = [] + warn = None + for action, i1, i2, j1, j2 in sm.get_opcodes(): + if action == 'equal': + continue + if action == 'delete': + # missing argument in l10n + if i2 == len(refSpecs): + # trailing specs missing, that's just a warning + warn = ', '.join('trailing argument %d `%s` missing' % + (i+1, refSpecs[i]) + for i in xrange(i1, i2)) + else: + for i in xrange(i1, i2): + msgs.append('argument %d `%s` missing' % + (i+1, refSpecs[i])) + continue + if action == 'insert': + # obsolete argument in l10n + for i in xrange(j1, j2): + msgs.append('argument %d `%s` obsolete' % + (i+1, l10nSpecs[i])) + continue + if action == 'replace': + for i, j in zip(xrange(i1, i2), xrange(j1, j2)): + msgs.append('argument %d `%s` should be `%s`' % + (j+1, l10nSpecs[j], refSpecs[i])) + if msgs: + yield ('error', 0, ', '.join(msgs), 'printf') + if warn is not None: + yield 
('warning', 0, warn, 'printf') + + def getPrintfSpecs(self, val): + hasNumber = False + specs = [] + for m in self.printf.finditer(val): + if m.group("good") is None: + # found just a '%', signal an error + raise PrintfException('Found single %', m.start()) + if m.group("good") == '%': + # escaped % + continue + if ((hasNumber and m.group('number') is None) or + (not hasNumber and specs and + m.group('number') is not None)): + # mixed style, numbered and not + raise PrintfException('Mixed ordered and non-ordered args', + m.start()) + hasNumber = m.group('number') is not None + if hasNumber: + pos = int(m.group('number')) - 1 + ls = len(specs) + if pos >= ls: + # pad specs + nones = pos - ls + specs[ls:pos] = nones*[None] + specs.append(m.group('spec')) + else: + if specs[pos] is not None: + raise PrintfException('Double ordered argument %d' % + (pos+1), + m.start()) + specs[pos] = m.group('spec') + else: + specs.append(m.group('spec')) + # check for missing args + if hasNumber and not all(specs): + raise PrintfException('Ordered argument missing', 0) + return specs + + +class DTDChecker(Checker): + """Tests to run on DTD files. + + Uses xml.sax for the heavy lifting of xml parsing. + + The code tries to parse until it doesn't find any unresolved entities + anymore. If it finds one, it tries to grab the key, and adds an empty + definition to the header. + + Also checks for some CSS and number heuristics in the values. 
+ """ + pattern = re.compile('.*\.dtd$') + + eref = re.compile('&(%s);' % DTDParser.Name) + tmpl = ''' +%s +''' + xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot')) + + def __init__(self, reference): + self.reference = reference + self.__known_entities = None + + def known_entities(self, refValue): + if self.__known_entities is None and self.reference is not None: + self.__known_entities = set() + for ent in self.reference: + self.__known_entities.update(self.entities_for_value(ent.val)) + return self.__known_entities if self.__known_entities is not None \ + else self.entities_for_value(refValue) + + def entities_for_value(self, value): + reflist = set(m.group(1).encode('utf-8') + for m in self.eref.finditer(value)) + reflist -= self.xmllist + return reflist + + # Setup for XML parser, with default and text-only content handler + class TextContent(sax.handler.ContentHandler): + textcontent = '' + + def characters(self, content): + self.textcontent += content + + defaulthandler = sax.handler.ContentHandler() + texthandler = TextContent() + + numPattern = r'([0-9]+|[0-9]*\.[0-9]+)' + num = re.compile('^%s$' % numPattern) + lengthPattern = '%s(em|px|ch|cm|in)' % numPattern + length = re.compile('^%s$' % lengthPattern) + spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' % + lengthPattern) + style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' % + {'spec': spec.pattern}) + + processContent = None + + def check(self, refEnt, l10nEnt): + """Try to parse the refvalue inside a dummy element, and keep + track of entities that we need to define to make that work. + + Return a checker that offers just those entities. + """ + refValue, l10nValue = refEnt.val, l10nEnt.val + # find entities the refValue references, + # reusing markup from DTDParser. 
+ reflist = self.known_entities(refValue) + inContext = self.entities_for_value(refValue) + entities = ''.join('' % s for s in sorted(reflist)) + parser = sax.make_parser() + parser.setFeature(sax.handler.feature_external_ges, False) + + parser.setContentHandler(self.defaulthandler) + try: + parser.parse(StringIO(self.tmpl % + (entities, refValue.encode('utf-8')))) + # also catch stray % + parser.parse(StringIO(self.tmpl % + (refEnt.all.encode('utf-8') + entities, + '&%s;' % refEnt.key.encode('utf-8')))) + except sax.SAXParseException, e: + yield ('warning', + (0, 0), + "can't parse en-US value", 'xmlparse') + + # find entities the l10nValue references, + # reusing markup from DTDParser. + l10nlist = self.entities_for_value(l10nValue) + missing = sorted(l10nlist - reflist) + _entities = entities + ''.join('' % s for s in missing) + if self.processContent is not None: + self.texthandler.textcontent = '' + parser.setContentHandler(self.texthandler) + try: + parser.parse(StringIO(self.tmpl % (_entities, + l10nValue.encode('utf-8')))) + # also catch stray % + # if this fails, we need to substract the entity definition + parser.setContentHandler(self.defaulthandler) + parser.parse(StringIO(self.tmpl % ( + l10nEnt.all.encode('utf-8') + _entities, + '&%s;' % l10nEnt.key.encode('utf-8')))) + except sax.SAXParseException, e: + # xml parse error, yield error + # sometimes, the error is reported on our fake closing + # element, make that the end of the last line + lnr = e.getLineNumber() - 1 + lines = l10nValue.splitlines() + if lnr > len(lines): + lnr = len(lines) + col = len(lines[lnr-1]) + else: + col = e.getColumnNumber() + if lnr == 1: + # first line starts with , substract + col -= len("") + elif lnr == 0: + col -= len("[\"']).*(?P=q)$") + + def unicode_escape(self, str): + """Helper method to try to decode all unicode escapes in a string. + + This code uses the standard python decode for unicode-escape, but + that's somewhat tricky, as its input needs to be ascii. 
To get to + ascii, the unicode string gets converted to ascii with + backslashreplace, i.e., all non-ascii unicode chars get unicode + escaped. And then we try to roll all of that back. + Now, when that hits an error, that's from the original string, and we + need to search for the actual error position in the original string, + as the backslashreplace code changes string positions quite badly. + See also the last check in TestAndroid.test_android_dtd, with a + lengthy chinese string. + """ + val = str.encode('ascii', 'backslashreplace') + try: + val.decode('unicode-escape') + except UnicodeDecodeError, e: + args = list(e.args) + badstring = args[1][args[2]:args[3]] + i = len(args[1][:args[2]].decode('unicode-escape')) + args[2] = i + args[3] = i + len(badstring) + raise UnicodeDecodeError(*args) + + @classmethod + def use(cls, file): + """Use this Checker only for DTD files in embedding/android.""" + return (file.module in ("embedding/android", + "mobile/android/base") and + cls.pattern.match(file.file)) + + def processContent(self, val): + """Actual check code. + Check for unicode escapes and unescaped quotes and apostrophes, + if string's not quoted. + """ + # first, try to decode unicode escapes + try: + self.unicode_escape(val) + except UnicodeDecodeError, e: + yield ('error', e.args[2], e.args[4], 'android') + # check for unescaped single or double quotes. + # first, see if the complete string is single or double quoted, + # that changes the rules + m = self.quoted.match(val) + if m: + q = m.group('q') + offset = 0 + val = val[1:-1] # strip quotes + else: + q = "[\"']" + offset = -1 + stray_quot = re.compile(r"[\\\\]*(%s)" % q) + + for m in stray_quot.finditer(val): + if len(m.group(0)) % 2: + # found an unescaped single or double quote, which message? + if m.group(1) == '"': + msg = u"Quotes in Android DTDs need escaping with \\\" "\ + u"or \\u0022, or put string in apostrophes." 
+ else: + msg = u"Apostrophes in Android DTDs need escaping with "\ + u"\\' or \\u0027, or use \u2019, or put string in "\ + u"quotes." + yield ('error', m.end(0)+offset, msg, 'android') + + +def getChecker(file, reference=None): + if PropertiesChecker.use(file): + return PropertiesChecker() + if PrincessAndroid.use(file): + return PrincessAndroid(reference) + if DTDChecker.use(file): + return DTDChecker(reference) + return None diff --git a/python/compare-locales/compare_locales/commands.py b/python/compare-locales/compare_locales/commands.py new file mode 100644 index 000000000..61b58ec4b --- /dev/null +++ b/python/compare-locales/compare_locales/commands.py @@ -0,0 +1,154 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +'Commands exposed to commandlines' + +import logging +from optparse import OptionParser, make_option + +from compare_locales.paths import EnumerateApp +from compare_locales.compare import compareApp, compareDirs +from compare_locales.webapps import compare_web_app + + +class BaseCommand(object): + """Base class for compare-locales commands. + This handles command line parsing, and general sugar for setuptools + entry_points. + """ + options = [ + make_option('-v', '--verbose', action='count', dest='v', default=0, + help='Make more noise'), + make_option('-q', '--quiet', action='count', dest='q', default=0, + help='Make less noise'), + make_option('-m', '--merge', + help='''Use this directory to stage merged files, +use {ab_CD} to specify a different directory for each locale'''), + ] + data_option = make_option('--data', choices=['text', 'exhibit', 'json'], + default='text', + help='''Choose data and format (one of text, +exhibit, json); text: (default) Show which files miss which strings, together +with warnings and errors. 
Also prints a summary; json: Serialize the internal +tree, useful for tools. Also always succeeds; exhibit: Serialize the summary +data in a json useful for Exhibit +''') + + def __init__(self): + self.parser = None + + def get_parser(self): + """Get an OptionParser, with class docstring as usage, and + self.options. + """ + parser = OptionParser() + parser.set_usage(self.__doc__) + for option in self.options: + parser.add_option(option) + return parser + + @classmethod + def call(cls): + """Entry_point for setuptools. + The actual command handling is done in the handle() method of the + subclasses. + """ + cmd = cls() + cmd.handle_() + + def handle_(self): + """The instance part of the classmethod call.""" + self.parser = self.get_parser() + (options, args) = self.parser.parse_args() + # log as verbose or quiet as we want, warn by default + logging.basicConfig() + logging.getLogger().setLevel(logging.WARNING - + (options.v - options.q)*10) + observer = self.handle(args, options) + print observer.serialize(type=options.data).encode('utf-8', 'replace') + + def handle(self, args, options): + """Subclasses need to implement this method for the actual + command handling. + """ + raise NotImplementedError + + +class CompareLocales(BaseCommand): + """usage: %prog [options] l10n.ini l10n_base_dir [locale ...] + +Check the localization status of a gecko application. +The first argument is a path to the l10n.ini file for the application, +followed by the base directory of the localization repositories. +Then you pass in the list of locale codes you want to compare. If there are +not locales given, the list of locales will be taken from the all-locales file +of the application\'s l10n.ini.""" + + options = BaseCommand.options + [ + make_option('--clobber-merge', action="store_true", default=False, + dest='clobber', + help="""WARNING: DATALOSS. +Use this option with care. If specified, the merge directory will +be clobbered for each module. 
That means, the subdirectory will +be completely removed, any files that were there are lost. +Be careful to specify the right merge directory when using this option."""), + make_option('-r', '--reference', default='en-US', dest='reference', + help='Explicitly set the reference ' + 'localization. [default: en-US]'), + BaseCommand.data_option + ] + + def handle(self, args, options): + if len(args) < 2: + self.parser.error('Need to pass in list of languages') + inipath, l10nbase = args[:2] + locales = args[2:] + app = EnumerateApp(inipath, l10nbase, locales) + app.reference = options.reference + try: + observer = compareApp(app, merge_stage=options.merge, + clobber=options.clobber) + except (OSError, IOError), exc: + print "FAIL: " + str(exc) + self.parser.exit(2) + return observer + + +class CompareDirs(BaseCommand): + """usage: %prog [options] reference localization + +Check the localization status of a directory tree. +The first argument is a path to the reference data,the second is the +localization to be tested.""" + + options = BaseCommand.options + [ + BaseCommand.data_option + ] + + def handle(self, args, options): + if len(args) != 2: + self.parser.error('Reference and localizatino required') + reference, locale = args + observer = compareDirs(reference, locale, merge_stage=options.merge) + return observer + + +class CompareWebApp(BaseCommand): + """usage: %prog [options] webapp [locale locale] + +Check the localization status of a gaia-style web app. +The first argument is the directory of the web app. +Following arguments explicitly state the locales to test. 
+If none are given, test all locales in manifest.webapp or files.""" + + options = BaseCommand.options[:-1] + [ + BaseCommand.data_option] + + def handle(self, args, options): + if len(args) < 1: + self.parser.error('Webapp directory required') + basedir = args[0] + locales = args[1:] + observer = compare_web_app(basedir, locales) + return observer diff --git a/python/compare-locales/compare_locales/compare.py b/python/compare-locales/compare_locales/compare.py new file mode 100644 index 000000000..4f71c46f8 --- /dev/null +++ b/python/compare-locales/compare_locales/compare.py @@ -0,0 +1,638 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +'Mozilla l10n compare locales tool' + +import codecs +import os +import os.path +import shutil +import re +from difflib import SequenceMatcher +from collections import defaultdict + +try: + from json import dumps +except: + from simplejson import dumps + +from compare_locales import parser +from compare_locales import paths +from compare_locales.checks import getChecker + + +class Tree(object): + def __init__(self, valuetype): + self.branches = dict() + self.valuetype = valuetype + self.value = None + + def __getitem__(self, leaf): + parts = [] + if isinstance(leaf, paths.File): + parts = [p for p in [leaf.locale, leaf.module] if p] + \ + leaf.file.split('/') + else: + parts = leaf.split('/') + return self.__get(parts) + + def __get(self, parts): + common = None + old = None + new = tuple(parts) + t = self + for k, v in self.branches.iteritems(): + for i, part in enumerate(zip(k, parts)): + if part[0] != part[1]: + i -= 1 + break + if i < 0: + continue + i += 1 + common = tuple(k[:i]) + old = tuple(k[i:]) + new = tuple(parts[i:]) + break + if old: + self.branches.pop(k) + t = Tree(self.valuetype) + t.branches[old] = v + self.branches[common] = t + elif common: + t = 
self.branches[common] + if new: + if common: + return t.__get(new) + t2 = t + t = Tree(self.valuetype) + t2.branches[new] = t + if t.value is None: + t.value = t.valuetype() + return t.value + + indent = ' ' + + def getContent(self, depth=0): + ''' + Returns iterator of (depth, flag, key_or_value) tuples. + If flag is 'value', key_or_value is a value object, otherwise + (flag is 'key') it's a key string. + ''' + keys = self.branches.keys() + keys.sort() + if self.value is not None: + yield (depth, 'value', self.value) + for key in keys: + yield (depth, 'key', key) + for child in self.branches[key].getContent(depth + 1): + yield child + + def toJSON(self): + ''' + Returns this Tree as a JSON-able tree of hashes. + Only the values need to take care that they're JSON-able. + ''' + json = {} + keys = self.branches.keys() + keys.sort() + if self.value is not None: + json['value'] = self.value + children = [('/'.join(key), self.branches[key].toJSON()) + for key in keys] + if children: + json['children'] = children + return json + + def getStrRows(self): + def tostr(t): + if t[1] == 'key': + return self.indent * t[0] + '/'.join(t[2]) + return self.indent * (t[0] + 1) + str(t[2]) + + return map(tostr, self.getContent()) + + def __str__(self): + return '\n'.join(self.getStrRows()) + + +class AddRemove(SequenceMatcher): + def __init__(self): + SequenceMatcher.__init__(self, None, None, None) + + def set_left(self, left): + if not isinstance(left, list): + left = [l for l in left] + self.set_seq1(left) + + def set_right(self, right): + if not isinstance(right, list): + right = [l for l in right] + self.set_seq2(right) + + def __iter__(self): + for tag, i1, i2, j1, j2 in self.get_opcodes(): + if tag == 'equal': + for pair in zip(self.a[i1:i2], self.b[j1:j2]): + yield ('equal', pair) + elif tag == 'delete': + for item in self.a[i1:i2]: + yield ('delete', item) + elif tag == 'insert': + for item in self.b[j1:j2]: + yield ('add', item) + else: + # tag == 'replace' + for item in 
self.a[i1:i2]: + yield ('delete', item) + for item in self.b[j1:j2]: + yield ('add', item) + + +class DirectoryCompare(SequenceMatcher): + def __init__(self, reference): + SequenceMatcher.__init__(self, None, [i for i in reference], + []) + self.watcher = None + + def setWatcher(self, watcher): + self.watcher = watcher + + def compareWith(self, other): + if not self.watcher: + return + self.set_seq2([i for i in other]) + for tag, i1, i2, j1, j2 in self.get_opcodes(): + if tag == 'equal': + for i, j in zip(xrange(i1, i2), xrange(j1, j2)): + self.watcher.compare(self.a[i], self.b[j]) + elif tag == 'delete': + for i in xrange(i1, i2): + self.watcher.add(self.a[i], other.cloneFile(self.a[i])) + elif tag == 'insert': + for j in xrange(j1, j2): + self.watcher.remove(self.b[j]) + else: + for j in xrange(j1, j2): + self.watcher.remove(self.b[j]) + for i in xrange(i1, i2): + self.watcher.add(self.a[i], other.cloneFile(self.a[i])) + + +class Observer(object): + stat_cats = ['missing', 'obsolete', 'missingInFiles', 'report', + 'changed', 'unchanged', 'keys'] + + def __init__(self): + class intdict(defaultdict): + def __init__(self): + defaultdict.__init__(self, int) + + self.summary = defaultdict(intdict) + self.details = Tree(dict) + self.filter = None + + # support pickling + def __getstate__(self): + return dict(summary=self.getSummary(), details=self.details) + + def __setstate__(self, state): + class intdict(defaultdict): + def __init__(self): + defaultdict.__init__(self, int) + + self.summary = defaultdict(intdict) + if 'summary' in state: + for loc, stats in state['summary'].iteritems(): + self.summary[loc].update(stats) + self.details = state['details'] + self.filter = None + + def getSummary(self): + plaindict = {} + for k, v in self.summary.iteritems(): + plaindict[k] = dict(v) + return plaindict + + def toJSON(self): + return dict(summary=self.getSummary(), details=self.details.toJSON()) + + def notify(self, category, file, data): + rv = "error" + if category in 
self.stat_cats: + # these get called post reporting just for stats + # return "error" to forward them to other other_observers + self.summary[file.locale][category] += data + # keep track of how many strings are in a missing file + # we got the {'missingFile': 'error'} from the first pass + if category == 'missingInFiles': + self.details[file]['strings'] = data + return "error" + if category in ['missingFile', 'obsoleteFile']: + if self.filter is not None: + rv = self.filter(file) + if rv != "ignore": + self.details[file][category] = rv + return rv + if category in ['missingEntity', 'obsoleteEntity']: + if self.filter is not None: + rv = self.filter(file, data) + if rv == "ignore": + return rv + v = self.details[file] + try: + v[category].append(data) + except KeyError: + v[category] = [data] + return rv + if category == 'error': + try: + self.details[file][category].append(data) + except KeyError: + self.details[file][category] = [data] + self.summary[file.locale]['errors'] += 1 + elif category == 'warning': + try: + self.details[file][category].append(data) + except KeyError: + self.details[file][category] = [data] + self.summary[file.locale]['warnings'] += 1 + return rv + + def toExhibit(self): + items = [] + for locale in sorted(self.summary.iterkeys()): + summary = self.summary[locale] + if locale is not None: + item = {'id': 'xxx/' + locale, + 'label': locale, + 'locale': locale} + else: + item = {'id': 'xxx', + 'label': 'xxx', + 'locale': 'xxx'} + item['type'] = 'Build' + total = sum([summary[k] + for k in ('changed', 'unchanged', 'report', 'missing', + 'missingInFiles') + if k in summary]) + rate = (('changed' in summary and summary['changed'] * 100) or + 0) / total + item.update((k, summary.get(k, 0)) + for k in ('changed', 'unchanged')) + item.update((k, summary[k]) + for k in ('report', 'errors', 'warnings') + if k in summary) + item['missing'] = summary.get('missing', 0) + \ + summary.get('missingInFiles', 0) + item['completion'] = rate + item['total'] 
= total + result = 'success' + if item.get('warnings', 0): + result = 'warning' + if item.get('errors', 0) or item.get('missing', 0): + result = 'failure' + item['result'] = result + items.append(item) + data = { + "properties": dict.fromkeys( + ("completion", "errors", "warnings", "missing", "report", + "unchanged", "changed", "obsolete"), + {"valueType": "number"}), + "types": { + "Build": {"pluralLabel": "Builds"} + }} + data['items'] = items + return dumps(data, indent=2) + + def serialize(self, type="text"): + if type == "exhibit": + return self.toExhibit() + if type == "json": + return dumps(self.toJSON()) + + def tostr(t): + if t[1] == 'key': + return ' ' * t[0] + '/'.join(t[2]) + o = [] + indent = ' ' * (t[0] + 1) + if 'error' in t[2]: + o += [indent + 'ERROR: ' + e for e in t[2]['error']] + if 'warning' in t[2]: + o += [indent + 'WARNING: ' + e for e in t[2]['warning']] + if 'missingEntity' in t[2] or 'obsoleteEntity' in t[2]: + missingEntities = ('missingEntity' in t[2] and + t[2]['missingEntity']) or [] + obsoleteEntities = ('obsoleteEntity' in t[2] and + t[2]['obsoleteEntity']) or [] + entities = missingEntities + obsoleteEntities + entities.sort() + for entity in entities: + op = '+' + if entity in obsoleteEntities: + op = '-' + o.append(indent + op + entity) + elif 'missingFile' in t[2]: + o.append(indent + '// add and localize this file') + elif 'obsoleteFile' in t[2]: + o.append(indent + '// remove this file') + return '\n'.join(o) + + out = [] + for locale, summary in sorted(self.summary.iteritems()): + if locale is not None: + out.append(locale + ':') + out += [k + ': ' + str(v) for k, v in sorted(summary.iteritems())] + total = sum([summary[k] + for k in ['changed', 'unchanged', 'report', 'missing', + 'missingInFiles'] + if k in summary]) + rate = 0 + if total: + rate = (('changed' in summary and summary['changed'] * 100) or + 0) / total + out.append('%d%% of entries changed' % rate) + return '\n'.join(map(tostr, self.details.getContent()) + out) 
+ + def __str__(self): + return 'observer' + + +class ContentComparer: + keyRE = re.compile('[kK]ey') + nl = re.compile('\n', re.M) + + def __init__(self): + '''Create a ContentComparer. + observer is usually a instance of Observer. The return values + of the notify method are used to control the handling of missing + entities. + ''' + self.reference = dict() + self.observer = Observer() + self.other_observers = [] + self.merge_stage = None + + def add_observer(self, obs): + '''Add a non-filtering observer. + Results from the notify calls are ignored. + ''' + self.other_observers.append(obs) + + def set_merge_stage(self, merge_stage): + self.merge_stage = merge_stage + + def merge(self, ref_entities, ref_map, ref_file, l10n_file, missing, + skips, p): + outfile = os.path.join(self.merge_stage, l10n_file.module, + l10n_file.file) + outdir = os.path.dirname(outfile) + if not os.path.isdir(outdir): + os.makedirs(outdir) + if not p.canMerge: + shutil.copyfile(ref_file.fullpath, outfile) + print "copied reference to " + outfile + return + if skips: + # skips come in ordered by key name, we need them in file order + skips.sort(key=lambda s: s.span[0]) + trailing = (['\n'] + + [ref_entities[ref_map[key]].all for key in missing] + + [ref_entities[ref_map[skip.key]].all for skip in skips + if not isinstance(skip, parser.Junk)]) + if skips: + # we need to skip a few errornous blocks in the input, copy by hand + f = codecs.open(outfile, 'wb', p.encoding) + offset = 0 + for skip in skips: + chunk = skip.span + f.write(p.contents[offset:chunk[0]]) + offset = chunk[1] + f.write(p.contents[offset:]) + else: + shutil.copyfile(l10n_file.fullpath, outfile) + f = codecs.open(outfile, 'ab', p.encoding) + print "adding to " + outfile + + def ensureNewline(s): + if not s.endswith('\n'): + return s + '\n' + return s + + f.write(''.join(map(ensureNewline, trailing))) + f.close() + + def notify(self, category, file, data): + """Check observer for the found data, and if it's + not to 
ignore, notify other_observers. + """ + rv = self.observer.notify(category, file, data) + if rv == 'ignore': + return rv + for obs in self.other_observers: + # non-filtering other_observers, ignore results + obs.notify(category, file, data) + return rv + + def remove(self, obsolete): + self.notify('obsoleteFile', obsolete, None) + pass + + def compare(self, ref_file, l10n): + try: + p = parser.getParser(ref_file.file) + except UserWarning: + # no comparison, XXX report? + return + if ref_file not in self.reference: + # we didn't parse this before + try: + p.readContents(ref_file.getContents()) + except Exception, e: + self.notify('error', ref_file, str(e)) + return + self.reference[ref_file] = p.parse() + ref = self.reference[ref_file] + ref_list = ref[1].keys() + ref_list.sort() + try: + p.readContents(l10n.getContents()) + l10n_entities, l10n_map = p.parse() + except Exception, e: + self.notify('error', l10n, str(e)) + return + lines = [] + + def _getLine(offset): + if not lines: + lines.append(0) + for m in self.nl.finditer(p.contents): + lines.append(m.end()) + for i in xrange(len(lines), 0, -1): + if offset >= lines[i - 1]: + return (i, offset - lines[i - 1]) + return (1, offset) + + l10n_list = l10n_map.keys() + l10n_list.sort() + ar = AddRemove() + ar.set_left(ref_list) + ar.set_right(l10n_list) + report = missing = obsolete = changed = unchanged = keys = 0 + missings = [] + skips = [] + checker = getChecker(l10n, reference=ref[0]) + for action, item_or_pair in ar: + if action == 'delete': + # missing entity + _rv = self.notify('missingEntity', l10n, item_or_pair) + if _rv == "ignore": + continue + if _rv == "error": + # only add to missing entities for l10n-merge on error, + # not report + missings.append(item_or_pair) + missing += 1 + else: + # just report + report += 1 + elif action == 'add': + # obsolete entity or junk + if isinstance(l10n_entities[l10n_map[item_or_pair]], + parser.Junk): + junk = l10n_entities[l10n_map[item_or_pair]] + params = 
(junk.val,) + junk.span + self.notify('error', l10n, + 'Unparsed content "%s" at %d-%d' % params) + if self.merge_stage is not None: + skips.append(junk) + elif self.notify('obsoleteEntity', l10n, + item_or_pair) != 'ignore': + obsolete += 1 + else: + # entity found in both ref and l10n, check for changed + entity = item_or_pair[0] + refent = ref[0][ref[1][entity]] + l10nent = l10n_entities[l10n_map[entity]] + if self.keyRE.search(entity): + keys += 1 + else: + if refent.val == l10nent.val: + self.doUnchanged(l10nent) + unchanged += 1 + else: + self.doChanged(ref_file, refent, l10nent) + changed += 1 + # run checks: + if checker: + for tp, pos, msg, cat in checker.check(refent, l10nent): + # compute real src position, if first line, + # col needs adjustment + _l, _offset = _getLine(l10nent.val_span[0]) + if isinstance(pos, tuple): + # line, column + if pos[0] == 1: + col = pos[1] + _offset + else: + col = pos[1] + _l += pos[0] - 1 + else: + _l, col = _getLine(l10nent.val_span[0] + pos) + # skip error entities when merging + if tp == 'error' and self.merge_stage is not None: + skips.append(l10nent) + self.notify(tp, l10n, + u"%s at line %d, column %d for %s" % + (msg, _l, col, refent.key)) + pass + if missing: + self.notify('missing', l10n, missing) + if self.merge_stage is not None and (missings or skips): + self.merge(ref[0], ref[1], ref_file, l10n, missings, skips, p) + if report: + self.notify('report', l10n, report) + if obsolete: + self.notify('obsolete', l10n, obsolete) + if changed: + self.notify('changed', l10n, changed) + if unchanged: + self.notify('unchanged', l10n, unchanged) + if keys: + self.notify('keys', l10n, keys) + pass + + def add(self, orig, missing): + if self.notify('missingFile', missing, None) == "ignore": + # filter said that we don't need this file, don't count it + return + f = orig + try: + p = parser.getParser(f.file) + except UserWarning: + return + try: + p.readContents(f.getContents()) + entities, map = p.parse() + except Exception, 
e: + self.notify('error', f, str(e)) + return + self.notify('missingInFiles', missing, len(map)) + + def doUnchanged(self, entity): + # overload this if needed + pass + + def doChanged(self, file, ref_entity, l10n_entity): + # overload this if needed + pass + + +def compareApp(app, other_observer=None, merge_stage=None, clobber=False): + '''Compare locales set in app. + + Optional arguments are: + - other_observer. A object implementing + notify(category, _file, data) + The return values of that callback are ignored. + - merge_stage. A directory to be used for staging the output of + l10n-merge. + - clobber. Clobber the module subdirectories of the merge dir as we go. + Use wisely, as it might cause data loss. + ''' + comparer = ContentComparer() + if other_observer is not None: + comparer.add_observer(other_observer) + comparer.observer.filter = app.filter + for module, reference, locales in app: + dir_comp = DirectoryCompare(reference) + dir_comp.setWatcher(comparer) + for _, localization in locales: + if merge_stage is not None: + locale_merge = merge_stage.format(ab_CD=localization.locale) + comparer.set_merge_stage(locale_merge) + if clobber: + # if clobber, remove the stage for the module if it exists + clobberdir = os.path.join(locale_merge, module) + if os.path.exists(clobberdir): + shutil.rmtree(clobberdir) + print "clobbered " + clobberdir + dir_comp.compareWith(localization) + return comparer.observer + + +def compareDirs(reference, locale, other_observer=None, merge_stage=None): + '''Compare reference and locale dir. + + Optional arguments are: + - other_observer. A object implementing + notify(category, _file, data) + The return values of that callback are ignored. 
+ ''' + comparer = ContentComparer() + if other_observer is not None: + comparer.add_observer(other_observer) + comparer.set_merge_stage(merge_stage) + dir_comp = DirectoryCompare(paths.EnumerateDir(reference)) + dir_comp.setWatcher(comparer) + dir_comp.compareWith(paths.EnumerateDir(locale)) + return comparer.observer diff --git a/python/compare-locales/compare_locales/parser.py b/python/compare-locales/compare_locales/parser.py new file mode 100644 index 000000000..a97cf201b --- /dev/null +++ b/python/compare-locales/compare_locales/parser.py @@ -0,0 +1,521 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import re +import codecs +import logging +from HTMLParser import HTMLParser + +__constructors = [] + + +class Entity(object): + ''' + Abstraction layer for a localizable entity. + Currently supported are grammars of the form: + + 1: pre white space + 2: pre comments + 3: entity definition + 4: entity key (name) + 5: entity value + 6: post comment (and white space) in the same line (dtd only) + <--[1] + <--[2] + + + <-------[3]---------><------[6]------> + ''' + def __init__(self, contents, pp, + span, pre_ws_span, pre_comment_span, def_span, + key_span, val_span, post_span): + self.contents = contents + self.span = span + self.pre_ws_span = pre_ws_span + self.pre_comment_span = pre_comment_span + self.def_span = def_span + self.key_span = key_span + self.val_span = val_span + self.post_span = post_span + self.pp = pp + pass + + # getter helpers + + def get_all(self): + return self.contents[self.span[0]:self.span[1]] + + def get_pre_ws(self): + return self.contents[self.pre_ws_span[0]:self.pre_ws_span[1]] + + def get_pre_comment(self): + return self.contents[self.pre_comment_span[0]: + self.pre_comment_span[1]] + + def get_def(self): + return self.contents[self.def_span[0]:self.def_span[1]] + + def 
get_key(self): + return self.contents[self.key_span[0]:self.key_span[1]] + + def get_val(self): + return self.pp(self.contents[self.val_span[0]:self.val_span[1]]) + + def get_raw_val(self): + return self.contents[self.val_span[0]:self.val_span[1]] + + def get_post(self): + return self.contents[self.post_span[0]:self.post_span[1]] + + # getters + + all = property(get_all) + pre_ws = property(get_pre_ws) + pre_comment = property(get_pre_comment) + definition = property(get_def) + key = property(get_key) + val = property(get_val) + raw_val = property(get_raw_val) + post = property(get_post) + + def __repr__(self): + return self.key + + +class Junk(object): + ''' + An almost-Entity, representing junk data that we didn't parse. + This way, we can signal bad content as stuff we don't understand. + And the either fix that, or report real bugs in localizations. + ''' + junkid = 0 + + def __init__(self, contents, span): + self.contents = contents + self.span = span + self.pre_ws = self.pre_comment = self.definition = self.post = '' + self.__class__.junkid += 1 + self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1]) + + # getter helpers + def get_all(self): + return self.contents[self.span[0]:self.span[1]] + + # getters + all = property(get_all) + val = property(get_all) + + def __repr__(self): + return self.key + + +class Parser: + canMerge = True + + def __init__(self): + if not hasattr(self, 'encoding'): + self.encoding = 'utf-8' + pass + + def readFile(self, file): + f = codecs.open(file, 'r', self.encoding) + try: + self.contents = f.read() + except UnicodeDecodeError, e: + (logging.getLogger('locales') + .error("Can't read file: " + file + '; ' + str(e))) + self.contents = u'' + f.close() + + def readContents(self, contents): + (self.contents, length) = codecs.getdecoder(self.encoding)(contents) + + def parse(self): + l = [] + m = {} + for e in self: + m[e.key] = len(l) + l.append(e) + return (l, m) + + def postProcessValue(self, val): + return val + 
+ def __iter__(self): + contents = self.contents + offset = 0 + self.header, offset = self.getHeader(contents, offset) + self.footer = '' + entity, offset = self.getEntity(contents, offset) + while entity: + yield entity + entity, offset = self.getEntity(contents, offset) + f = self.reFooter.match(contents, offset) + if f: + self.footer = f.group() + offset = f.end() + if len(contents) > offset: + yield Junk(contents, (offset, len(contents))) + pass + + def getHeader(self, contents, offset): + header = '' + h = self.reHeader.match(contents) + if h: + header = h.group() + offset = h.end() + return (header, offset) + + def getEntity(self, contents, offset): + m = self.reKey.match(contents, offset) + if m: + offset = m.end() + entity = self.createEntity(contents, m) + return (entity, offset) + # first check if footer has a non-empty match, + # 'cause then we don't find junk + m = self.reFooter.match(contents, offset) + if m and m.end() > offset: + return (None, offset) + m = self.reKey.search(contents, offset) + if m: + # we didn't match, but search, so there's junk between offset + # and start. 
We'll match() on the next turn + junkend = m.start() + return (Junk(contents, (offset, junkend)), junkend) + return (None, offset) + + def createEntity(self, contents, m): + return Entity(contents, self.postProcessValue, + *[m.span(i) for i in xrange(7)]) + + +def getParser(path): + for item in __constructors: + if re.search(item[0], path): + return item[1] + raise UserWarning("Cannot find Parser") + + +# Subgroups of the match will: +# 1: pre white space +# 2: pre comments +# 3: entity definition +# 4: entity key (name) +# 5: entity value +# 6: post comment (and white space) in the same line (dtd only) +# <--[1] +# <--[2] +# +# +# <-------[3]---------><------[6]------> + + +class DTDParser(Parser): + # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar + # ":" | [A-Z] | "_" | [a-z] | + # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] + # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | + # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | + # [#x10000-#xEFFFF] + CharMinusDash = u'\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD' + XmlComment = '' % CharMinusDash + NameStartChar = u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \ + u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \ + u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD' + # + \U00010000-\U000EFFFF seems to be unsupported in python + + # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | + # [#x0300-#x036F] | [#x203F-#x2040] + NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040' + Name = '[' + NameStartChar + '][' + NameChar + ']*' + reKey = re.compile('(?:(?P
\s*)(?P(?:' + XmlComment +
+                       '\s*)*)(?P' + Name +
+                       ')\s+(?P\"[^\"]*\"|\'[^\']*\'?)\s*>)'
+                       '(?P[ \t]*(?:' + XmlComment + '\s*)*\n?)?)',
+                       re.DOTALL)
+    # add BOM to DTDs, details in bug 435002
+    reHeader = re.compile(u'^\ufeff?'
+                          u'(\s*)?', re.S)
+    reFooter = re.compile('\s*(\s*)*$')
+    rePE = re.compile('(?:(\s*)((?:' + XmlComment + '\s*)*)'
+                      '(\s*%' + Name +
+                      ';)([ \t]*(?:' + XmlComment + '\s*)*\n?)?)')
+
    def getEntity(self, contents, offset):
        '''Overload Parser.getEntity to special-case parsed entities.

        If the generic scan yields Junk (or nothing), retry the same
        offset against rePE, so a parameter-entity reference such as

            %foo;

        (following its <!ENTITY % foo SYSTEM "url"> declaration) is
        consumed as a regular entity instead of being reported as junk.
        '''
        entity, inneroffset = Parser.getEntity(self, contents, offset)
        if (entity and isinstance(entity, Junk)) or entity is None:
            # The generic reKey pattern didn't match here; try the
            # parameter-entity pattern at the same offset.
            m = self.rePE.match(contents, offset)
            if m:
                inneroffset = m.end()
                # rePE groups 0..6 mirror reKey's layout, so the spans
                # feed the Entity constructor positionally.
                entity = Entity(contents, self.postProcessValue,
                                *[m.span(i) for i in xrange(7)])
        return (entity, inneroffset)
+
    def createEntity(self, contents, m):
        '''Build an Entity from a reKey match, trimming the surrounding
        quotes off the value span so Entity.val is the quoted string's
        content only.
        '''
        valspan = m.span('val')
        # drop the opening and closing quote characters
        valspan = (valspan[0]+1, valspan[1]-1)
        return Entity(contents, self.postProcessValue, m.span(),
                      m.span('pre'), m.span('precomment'),
                      m.span('entity'), m.span('key'), valspan,
                      m.span('post'))
+
+
class PropertiesParser(Parser):
    '''Parser for Java-style .properties localization files.

    Keys are separated from values by ':' or '=', '#' and '!' start
    comment lines, and values may continue over backslash-escaped
    newlines.
    '''
    # One escape sequence: \uNNNN, an escaped newline (plus following
    # indent), or any other single escaped character. The named groups
    # 'uni'/'nl'/'single' are required by postProcessValue's groupdict()
    # lookups below; the stripped form '(?P' (without '<name>') is a
    # regex syntax error, so the parser could never compile.
    escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
                        '(?P<nl>\n\s*)|(?P<single>.))', re.M)
    known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}

    def __init__(self):
        # leading whitespace, then any number of comment lines, then the
        # key up to (but excluding) the ':'/'=' separator
        self.reKey = re.compile('^(\s*)'
                                '((?:[#!].*?\n\s*)*)'
                                '([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
        self.reHeader = re.compile('^\s*([#!].*\s*)+')
        self.reFooter = re.compile('\s*([#!].*\s*)*$')
        # run of backslashes ending a line; odd count == escaped newline
        self._escapedEnd = re.compile(r'\\+$')
        self._trailingWS = re.compile(r'[ \t]*$')
        Parser.__init__(self)

    def getHeader(self, contents, offset):
        '''Only treat a leading comment as the file header when it looks
        like a license block; other leading comments belong to the first
        entity.
        '''
        header = ''
        h = self.reHeader.match(contents, offset)
        if h:
            candidate = h.group()
            if 'http://mozilla.org/MPL/2.0/' in candidate or \
                    'LICENSE BLOCK' in candidate:
                header = candidate
                offset = h.end()
        return (header, offset)

    def getEntity(self, contents, offset):
        # overwritten to parse values line by line, honoring escaped
        # newline continuations
        m = self.reKey.match(contents, offset)
        if m:
            offset = m.end()
            while True:
                endval = nextline = contents.find('\n', offset)
                if nextline == -1:
                    endval = offset = len(contents)
                    break
                # is newline escaped?
                _e = self._escapedEnd.search(contents, offset, nextline)
                offset = nextline + 1
                if _e is None:
                    break
                # backslashes at end of line, if 2*n, not escaped
                if len(_e.group()) % 2 == 0:
                    break
            # strip trailing whitespace
            ws = self._trailingWS.search(contents, m.end(), offset)
            if ws:
                endval -= ws.end() - ws.start()
            entity = Entity(contents, self.postProcessValue,
                            (m.start(), offset),   # full span
                            m.span(1),  # leading whitespan
                            m.span(2),  # leading comment span
                            (m.start(3), offset),   # entity def span
                            m.span(3),   # key span
                            (m.end(), endval),   # value span
                            (offset, offset))  # post comment span, empty
            return (entity, offset)
        m = self.reKey.search(contents, offset)
        if m:
            # we didn't match, but search, so there's junk between offset
            # and start. We'll match() on the next turn
            junkend = m.start()
            return (Junk(contents, (offset, junkend)), junkend)
        return (None, offset)

    def postProcessValue(self, val):
        '''Resolve backslash escapes in a raw property value.'''

        def unescape(m):
            found = m.groupdict()
            if found['uni']:
                # \uNNNN unicode escape
                return unichr(int(found['uni'][1:], 16))
            if found['nl']:
                # escaped newline plus indent: line continuation
                return ''
            return self.known_escapes.get(found['single'], found['single'])
        val = self.escape.sub(unescape, val)
        return val
+
+
class DefinesParser(Parser):
    '''Parser for preprocessor defines files (registered for .inc).'''
    # can't merge, #unfilter needs to be the last item, which we don't support
    canMerge = False

    def __init__(self):
        # an entity is '#define NAME value'; any other '#...' line is a
        # comment (hence the (?!define\s) look-aheads below)
        self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
                                '(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
                                re.M)
        self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
        self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
        Parser.__init__(self)
+
+
class IniParser(Parser):
    '''
    Parse files of the form:
    # initial comment
    [cat]
    whitespace*
    #comment
    string=value
    ...
    '''
    def __init__(self):
        # the header swallows everything up to and including the first
        # [section] line; sections are not modeled as entities
        self.reHeader = re.compile('^((?:\s*|[;#].*)\n)*\[.+?\]\n', re.M)
        # key=value, with any preceding ';'/'#' comment lines attached
        self.reKey = re.compile('(\s*)((?:[;#].*\n\s*)*)((.+?)=(.*))(\n?)')
        self.reFooter = re.compile('\s*([;#].*\s*)*$')
        Parser.__init__(self)
+
+
# Token type tags used by BookmarksParserInner's Token subclasses.
DECL, COMMENT, START, END, CONTENT = range(5)
+
+
class BookmarksParserInner(HTMLParser):
    '''HTMLParser subclass that flattens a bookmarks.html document into
    a flat list of typed tokens for BookmarksParser to walk.
    '''

    class Token(object):
        # _type is one of DECL, COMMENT, START, END, CONTENT
        _type = None
        content = ''

        def __str__(self):
            return self.content

    class DeclToken(Token):
        _type = DECL

        def __init__(self, decl):
            self.content = decl
            pass

        def __str__(self):
            # NOTE(review): this format string looks truncated (likely
            # stripped markup); '' % x raises TypeError when called.
            # Confirm the intended serialization before relying on it.
            return '' % self.content
        pass

    class CommentToken(Token):
        _type = COMMENT

        def __init__(self, comment):
            self.content = comment
            pass

        def __str__(self):
            # NOTE(review): same truncated-format-string concern as
            # DeclToken.__str__ above.
            return '' % self.content
        pass

    class StartToken(Token):
        _type = START

        def __init__(self, tag, attrs, content):
            self.tag = tag
            self.attrs = dict(attrs)
            self.content = content
            pass
        pass

    class EndToken(Token):
        _type = END

        def __init__(self, tag):
            self.tag = tag
            pass

        def __str__(self):
            # NOTE(review): same truncated-format-string concern as
            # DeclToken.__str__ above.
            return '' % self.tag.upper()
        pass

    class ContentToken(Token):
        _type = CONTENT

        def __init__(self, content):
            self.content = content
            pass
        pass

    def __init__(self):
        HTMLParser.__init__(self)
        # accumulated token stream, in document order
        self.tokens = []

    def parse(self, contents):
        '''Feed contents through HTMLParser and return the token list.'''
        self.tokens = []
        self.feed(contents)
        self.close()
        return self.tokens

    # record a <!...> declaration (e.g. the DOCTYPE)
    def handle_decl(self, decl):
        self.tokens.append(self.DeclToken(decl))

    # record a comment
    def handle_comment(self, comment):
        self.tokens.append(self.CommentToken(comment))

    def handle_starttag(self, tag, attrs):
        self.tokens.append(self.StartToken(tag, attrs,
                                           self.get_starttag_text()))

    # Called when text data is encountered
    def handle_data(self, data):
        # coalesce consecutive data chunks into one CONTENT token
        # NOTE(review): assumes some token precedes any text; text
        # before the first tag/decl would raise IndexError here.
        if self.tokens[-1]._type == CONTENT:
            self.tokens[-1].content += data
        else:
            self.tokens.append(self.ContentToken(data))

    def handle_charref(self, data):
        # keep character references verbatim in the content
        self.handle_data('&#%s;' % data)

    def handle_entityref(self, data):
        # keep entity references verbatim in the content
        self.handle_data('&%s;' % data)

    # record an end tag
    def handle_endtag(self, tag):
        self.tokens.append(self.EndToken(tag))
+
+
class BookmarksParser(Parser):
    '''Parser for bookmarks.html, yielding synthetic BMEntity objects
    keyed by the dotted path of open tags (plus '.@attr' for
    attributes).
    '''
    canMerge = False

    class BMEntity(object):
        # minimal Entity stand-in: just a key and a value
        def __init__(self, key, val):
            self.key = key
            self.val = val

    def __iter__(self):
        p = BookmarksParserInner()
        tks = p.parse(self.contents)
        i = 0
        # stack of currently open tag names
        k = []
        for i in xrange(len(tks)):
            t = tks[i]
            if t._type == START:
                k.append(t.tag)
                keys = t.attrs.keys()
                keys.sort()
                for attrname in keys:
                    yield self.BMEntity('.'.join(k) + '.@' + attrname,
                                        t.attrs[attrname])
                if i + 1 < len(tks) and tks[i+1]._type == CONTENT:
                    # NOTE(review): reassigning the for-loop variable does
                    # not skip the next iteration in Python; the CONTENT
                    # token is visited again, which is harmless since only
                    # START/END are acted upon below.
                    i += 1
                    t = tks[i]
                    v = t.content.strip()
                    if v:
                        yield self.BMEntity('.'.join(k), v)
            elif t._type == END:
                k.pop()
+
+
# Registry mapping filename patterns to shared parser instances;
# getParser() scans this list in order with re.search.
__constructors = [('\\.dtd$', DTDParser()),
                  ('\\.properties$', PropertiesParser()),
                  ('\\.ini$', IniParser()),
                  ('\\.inc$', DefinesParser()),
                  ('bookmarks\\.html$', BookmarksParser())]
diff --git a/python/compare-locales/compare_locales/paths.py b/python/compare-locales/compare_locales/paths.py
new file mode 100644
index 000000000..f72b3a2e7
--- /dev/null
+++ b/python/compare-locales/compare_locales/paths.py
@@ -0,0 +1,398 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import os.path
+import os
+from ConfigParser import ConfigParser, NoSectionError, NoOptionError
+from urlparse import urlparse, urljoin
+from urllib import pathname2url, url2pathname
+from urllib2 import urlopen
+from collections import defaultdict
+from compare_locales import util
+
+
class L10nConfigParser(object):
    '''Helper class to gather application information from ini files.

    This class is working on synchronous open to read files or web data.
    Subclass this and overwrite loadConfigs and addChild if you need async.
    '''
    def __init__(self, inipath, **kwargs):
        """Constructor for L10nConfigParsers

        inipath -- l10n.ini path
        Optional keyword arguments are forwarded to the inner ConfigParser as
        defaults.
        """
        # normalize inipath to a file: URL so it can be urljoin'ed with
        # relative include paths later
        if os.path.isabs(inipath):
            self.inipath = 'file:%s' % pathname2url(inipath)
        else:
            pwdurl = 'file:%s/' % pathname2url(os.getcwd())
            self.inipath = urljoin(pwdurl, inipath)
        # l10n.ini files can import other l10n.ini files, store the
        # corresponding L10nConfigParsers
        self.children = []
        # we really only care about the l10n directories described in l10n.ini
        self.dirs = []
        # optional defaults to be passed to the inner ConfigParser (unused?)
        self.defaults = kwargs

    def getDepth(self, cp):
        '''Get the depth for the comparison from the parsed l10n.ini.

        Overloadable to get the source depth for fennec and friends.
        '''
        try:
            depth = cp.get('general', 'depth')
        except:
            # NOTE(review): bare except; any failure (typically a missing
            # section/option) falls back to the current directory
            depth = '.'
        return depth

    def getFilters(self):
        '''Get the test functions from this ConfigParser and all children.

        Only works with synchronous loads, used by compare-locales, which
        is local anyway.
        '''
        filterurl = urljoin(self.inipath, 'filter.py')
        try:
            l = {}
            # NOTE(review): executes filter.py found next to l10n.ini;
            # this is arbitrary code execution, so configs must come
            # from a trusted source
            execfile(url2pathname(urlparse(filterurl).path), {}, l)
            if 'test' in l and callable(l['test']):
                filters = [l['test']]
            else:
                filters = []
        except:
            # a missing or broken filter.py simply means no filters
            filters = []

        for c in self.children:
            filters += c.getFilters()

        return filters

    def loadConfigs(self):
        """Entry point to load the l10n.ini file this Parser refers to.

        This implementation uses synchronous loads, subclasses might overload
        this behaviour. If you do, make sure to pass a file-like object
        to onLoadConfig.
        """
        self.onLoadConfig(urlopen(self.inipath))

    def onLoadConfig(self, inifile):
        """Parse a file-like object for the loaded l10n.ini file."""
        cp = ConfigParser(self.defaults)
        cp.readfp(inifile)
        depth = self.getDepth(cp)
        self.baseurl = urljoin(self.inipath, depth)
        # create child loaders for any other l10n.ini files to be included
        try:
            for title, path in cp.items('includes'):
                # skip default items
                if title in self.defaults:
                    continue
                # add child config parser
                self.addChild(title, path, cp)
        except NoSectionError:
            pass
        # try to load the "dirs" defined in the "compare" section
        try:
            self.dirs.extend(cp.get('compare', 'dirs').split())
        except (NoOptionError, NoSectionError):
            pass
        # try getting a top level compare dir, as used for fennec
        try:
            self.tld = cp.get('compare', 'tld')
            # remove tld from comparison dirs
            if self.tld in self.dirs:
                self.dirs.remove(self.tld)
        except (NoOptionError, NoSectionError):
            self.tld = None
        # try to set "all_path" and "all_url"
        try:
            self.all_path = cp.get('general', 'all')
            self.all_url = urljoin(self.baseurl, self.all_path)
        except (NoOptionError, NoSectionError):
            self.all_path = None
            self.all_url = None
        return cp

    def addChild(self, title, path, orig_cp):
        """Create a child L10nConfigParser and load it.

        title -- indicates the module's name
        path -- indicates the path to the module's l10n.ini file
        orig_cp -- the configuration parser of this l10n.ini
        """
        cp = L10nConfigParser(urljoin(self.baseurl, path), **self.defaults)
        cp.loadConfigs()
        self.children.append(cp)

    def getTLDPathsTuple(self, basepath):
        """Given the basepath, return the path fragments to be used for
        self.tld. For build runs, this is (basepath, self.tld), for
        source runs, just (basepath,).

        @see overwritten method in SourceTreeConfigParser.
        """
        return (basepath, self.tld)

    def dirsIter(self):
        """Iterate over all dirs and our base path for this l10n.ini"""
        url = urlparse(self.baseurl)
        basepath = url2pathname(url.path)
        if self.tld is not None:
            yield self.tld, self.getTLDPathsTuple(basepath)
        for dir in self.dirs:
            yield dir, (basepath, dir)

    def directories(self):
        """Iterate over all dirs and base paths for this l10n.ini as well
        as the included ones.
        """
        for t in self.dirsIter():
            yield t
        for child in self.children:
            for t in child.directories():
                yield t

    def allLocales(self):
        """Return a list of all the locales of this project

        NOTE(review): assumes [general] all was configured; all_url is
        None otherwise and urlopen would fail — confirm callers only use
        this when an all-locales file exists.
        """
        return util.parseLocales(urlopen(self.all_url).read())
+
+
class SourceTreeConfigParser(L10nConfigParser):
    '''Subclassing L10nConfigParser to work with just the repos
    checked out next to each other instead of intermingled like
    we do for real builds.
    '''

    def __init__(self, inipath, basepath, **kwargs):
        '''Add additional argument basepath.

        basepath is used to resolve local paths via branchnames.
        Remaining keyword arguments are forwarded to L10nConfigParser
        as ConfigParser defaults. This matches the
        ``SourceTreeConfigParser(path, self.basepath, **self.defaults)``
        call in addChild below, which previously raised TypeError
        whenever defaults were non-empty.
        '''
        L10nConfigParser.__init__(self, inipath, **kwargs)
        self.basepath = basepath
        # source checkouts have no top-level compare dir by default
        self.tld = None

    def getDepth(self, cp):
        '''Get the depth for the comparison from the parsed l10n.ini.

        Overloaded to get the source depth for fennec and friends.
        '''
        try:
            depth = cp.get('general', 'source-depth')
        except:
            # fall back to the regular build depth, then to '.'
            try:
                depth = cp.get('general', 'depth')
            except:
                depth = '.'
        return depth

    def addChild(self, title, path, orig_cp):
        # check if there's a section with details for this include
        # we might have to check a different repo, or even VCS
        # for example, projects like "mail" indicate in
        # an "include_" section where to find the l10n.ini for "toolkit"
        details = 'include_' + title
        if orig_cp.has_section(details):
            branch = orig_cp.get(details, 'mozilla')
            inipath = orig_cp.get(details, 'l10n.ini')
            path = self.basepath + '/' + branch + '/' + inipath
        else:
            path = urljoin(self.baseurl, path)
        cp = SourceTreeConfigParser(path, self.basepath, **self.defaults)
        cp.loadConfigs()
        self.children.append(cp)

    def getTLDPathsTuple(self, basepath):
        """Overwrite L10nConfigParser's getTLDPathsTuple to just return
        the basepath.
        """
        return (basepath, )
+
+
class File(object):
    '''A localizable file, identified by its module-relative path.

    Hashing and ordering use (module, file) so reference and locale
    copies of the same resource compare equal.
    '''

    def __init__(self, fullpath, file, module=None, locale=None):
        self.fullpath = fullpath
        self.file = file
        self.module = module
        self.locale = locale

    def getContents(self):
        # open with universal line ending support and read
        return open(self.fullpath, 'rU').read()

    def __hash__(self):
        # hash the module-qualified path when a module is set
        return hash(self.module + '/' + self.file if self.module
                    else self.file)

    def __str__(self):
        return self.fullpath

    def __cmp__(self, other):
        if not isinstance(other, File):
            raise NotImplementedError
        # order by module first, file name second; cmp() yields 0 for
        # equal modules, letting `or` fall through to the file compare
        return cmp(self.module, other.module) or cmp(self.file, other.file)
+
+
class EnumerateDir(object):
    '''Breadth-first enumeration of the files under a base directory,
    yielding File objects and skipping VCS bookkeeping directories.
    '''
    ignore_dirs = ['CVS', '.svn', '.hg', '.git']

    def __init__(self, basepath, module='', locale=None, ignore_subdirs=None):
        '''
        basepath -- directory to walk
        module/locale -- metadata attached to each yielded File
        ignore_subdirs -- basepath-relative subdirectories to skip
        '''
        self.basepath = basepath
        self.module = module
        self.locale = locale
        # default to a fresh list per instance; the former `=[]` default
        # was shared across all instances (mutable default argument)
        self.ignore_subdirs = ignore_subdirs if ignore_subdirs is not None \
            else []

    def cloneFile(self, other):
        '''
        Return a File object that this enumerator would return, if it had it.
        '''
        return File(os.path.join(self.basepath, other.file), other.file,
                    self.module, self.locale)

    def __iter__(self):
        # our local dirs are given as a tuple of path segments, starting off
        # with an empty sequence for the basepath.
        dirs = [()]
        while dirs:
            dir = dirs.pop(0)
            fulldir = os.path.join(self.basepath, *dir)
            try:
                entries = os.listdir(fulldir)
            except OSError:
                # we probably just started off in a non-existing dir, ignore
                continue
            entries.sort()
            for entry in entries:
                leaf = os.path.join(fulldir, entry)
                if os.path.isdir(leaf):
                    # recurse unless the directory is VCS metadata or
                    # explicitly ignored
                    if entry not in self.ignore_dirs and \
                        leaf not in [os.path.join(self.basepath, d)
                                     for d in self.ignore_subdirs]:
                        dirs.append(dir + (entry,))
                    continue
                yield File(leaf, '/'.join(dir + (entry,)),
                           self.module, self.locale)
+
+
class LocalesWrap(object):
    '''Iterate (locale, EnumerateDir) pairs for one module across a
    list of locales rooted at base.
    '''

    def __init__(self, base, module, locales, ignore_subdirs=None):
        self.base = base
        self.module = module
        self.locales = locales
        # default to a fresh list per instance; the former `=[]` default
        # was shared across all instances (mutable default argument)
        self.ignore_subdirs = ignore_subdirs if ignore_subdirs is not None \
            else []

    def __iter__(self):
        for locale in self.locales:
            # layout is <base>/<locale>/<module>
            path = os.path.join(self.base, locale, self.module)
            yield (locale, EnumerateDir(path, self.module, locale,
                                        self.ignore_subdirs))
+
+
class EnumerateApp(object):
    '''Enumerate all modules and locales of an application, driven by
    its l10n.ini.
    '''
    reference = 'en-US'

    def __init__(self, inipath, l10nbase, locales=None):
        '''
        inipath -- path to the application's l10n.ini
        l10nbase -- base directory of the l10n checkouts
        locales -- optional list of locales; defaults to the locales in
                   the all-locales file referenced by l10n.ini
        '''
        self.setupConfigParser(inipath)
        self.modules = defaultdict(dict)
        self.l10nbase = os.path.abspath(l10nbase)
        self.filters = []
        # (a dead `os.path.splitdrive(inipath)` call was removed here;
        # neither of its results was used)
        self.addFilters(*self.config.getFilters())
        self.locales = locales or self.config.allLocales()
        # NOTE: sorts the caller-provided list in place
        self.locales.sort()

    def setupConfigParser(self, inipath):
        # overloaded by EnumerateSourceTreeApp
        self.config = L10nConfigParser(inipath)
        self.config.loadConfigs()

    def addFilters(self, *args):
        self.filters += args

    # severity rank for a filter's string verdict
    value_map = {None: None, 'error': 0, 'ignore': 1, 'report': 2}

    def filter(self, l10n_file, entity=None):
        '''Go through all added filters, and,
        - map "error" -> 0, "ignore" -> 1, "report" -> 2
        - if filter.test returns a bool, map that to
            False -> "ignore" (1), True -> "error" (0)
        - take the max of all reported
        '''
        rv = 0
        for f in reversed(self.filters):
            try:
                _r = f(l10n_file.module, l10n_file.file, entity)
            except:
                # XXX error handling
                continue
            if isinstance(_r, bool):
                _r = [1, 0][_r]
            else:
                # map string return value to int, default to 'error',
                # None is None
                _r = self.value_map.get(_r, 0)
            if _r is not None:
                rv = max(rv, _r)
        return ['error', 'ignore', 'report'][rv]

    def __iter__(self):
        '''
        Iterate over all modules, return en-US directory enumerator, and an
        iterator over all locales in each iteration. Per locale, the locale
        code and a directory enumerator will be given.
        '''
        dirmap = dict(self.config.directories())
        mods = dirmap.keys()
        mods.sort()
        for mod in mods:
            if self.reference == 'en-US':
                base = os.path.join(*(dirmap[mod] + ('locales', 'en-US')))
            else:
                base = os.path.join(self.l10nbase, self.reference, mod)
            # nested modules (e.g. 'mod/sub') become ignore_subdirs of
            # their parent so files aren't enumerated twice
            yield (mod, EnumerateDir(base, mod, self.reference),
                   LocalesWrap(self.l10nbase, mod, self.locales,
                   [m[len(mod)+1:] for m in mods if m.startswith(mod+'/')]))
+
+
class EnumerateSourceTreeApp(EnumerateApp):
    '''Subclass EnumerateApp to work on side-by-side checked out
    repos, and to not pay attention to how the source would actually
    be checked out for building.

    It's supporting applications like Fennec, too, which have
    'locales/en-US/...' in their root dir, but claim to be 'mobile'.
    '''

    def __init__(self, inipath, basepath, l10nbase, locales=None):
        # basepath is where the sibling source repos are checked out;
        # it must be set before EnumerateApp.__init__ triggers
        # setupConfigParser below
        self.basepath = basepath
        EnumerateApp.__init__(self, inipath, l10nbase, locales)

    def setupConfigParser(self, inipath):
        # use the source-tree-aware config parser instead of the default
        self.config = SourceTreeConfigParser(inipath, self.basepath)
        self.config.loadConfigs()
+
+
def get_base_path(mod, loc):
    '''Return the checkout-layout base path for module mod in locale loc.

    en-US lives in the source tree under mozilla/, every other locale
    under the l10n/ tree.
    '''
    if loc == 'en-US':
        return 'mozilla/%s/locales/en-US' % mod
    return 'l10n/%s/%s' % (loc, mod)
+
+
def get_path(mod, loc, leaf):
    '''Return the full checkout-layout path of leaf in module mod,
    locale loc.
    '''
    return '/'.join((get_base_path(mod, loc), leaf))
diff --git a/python/compare-locales/compare_locales/tests/__init__.py b/python/compare-locales/compare_locales/tests/__init__.py
new file mode 100644
index 000000000..8808d78f4
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/__init__.py
@@ -0,0 +1,49 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''Mixins for parser tests.
+'''
+
+from itertools import izip_longest
+from pkg_resources import resource_string
+import re
+
+from compare_locales.parser import getParser
+
+
+class ParserTestMixin():
+    '''Utility methods used by the parser tests.
+    '''
+    filename = None
+
+    def setUp(self):
+        '''Create a parser for this test.
+        '''
+        self.parser = getParser(self.filename)
+
+    def tearDown(self):
+        'tear down this test'
+        del self.parser
+
+    def resource(self, name):
+        testcontent = resource_string(__name__, 'data/' + name)
+        # fake universal line endings
+        testcontent = re.sub('\r\n?', lambda m: '\n', testcontent)
+        return testcontent
+
+    def _test(self, content, refs):
+        '''Helper to test the parser.
+        Compares the result of parsing content with the given list
+        of reference keys and values.
+        '''
+        self.parser.readContents(content)
+        entities = [entity for entity in self.parser]
+        for entity, ref in izip_longest(entities, refs):
+            self.assertTrue(entity, 'excess reference entity')
+            self.assertTrue(ref, 'excess parsed entity')
+            self.assertEqual(entity.val, ref[1])
+            if ref[0].startswith('_junk'):
+                self.assertTrue(re.match(ref[0], entity.key))
+            else:
+                self.assertEqual(entity.key, ref[0])
diff --git a/python/compare-locales/compare_locales/tests/data/bug121341.properties b/python/compare-locales/compare_locales/tests/data/bug121341.properties
new file mode 100644
index 000000000..b45fc9698
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/bug121341.properties
@@ -0,0 +1,68 @@
+# simple check
+1=abc
+# test whitespace trimming in key and value
+  2	=   xy	
+# test parsing of escaped values
+3 = \u1234\t\r\n\uAB\
+\u1\n
+# test multiline properties
+4 = this is \
+multiline property
+5 = this is \
+	   another multiline property
+# property with DOS EOL
+6 = test\u0036
+# test multiline property with with DOS EOL
+7 = yet another multi\
+    line propery
+# trimming should not trim escaped whitespaces
+8 =	\ttest5\u0020	
+# another variant of #8
+9 =     \ test6\t	    
+# test UTF-8 encoded property/value
+10aሴb = c췯d
+# next property should test unicode escaping at the boundary of parsing buffer
+# buffer size is expected to be 4096 so add comments to get to this offset
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+################################################################################
+###############################################################################
+11 = \uABCD
diff --git a/python/compare-locales/compare_locales/tests/data/test.properties b/python/compare-locales/compare_locales/tests/data/test.properties
new file mode 100644
index 000000000..19cae9702
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/test.properties
@@ -0,0 +1,14 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+1=1
+ 2=2
+3 =3
+ 4 =4
+5=5
+6= 6
+7=7 
+8= 8 
+# this is a comment
+9=this is the first part of a continued line \
+ and here is the 2nd part
diff --git a/python/compare-locales/compare_locales/tests/data/triple-license.dtd b/python/compare-locales/compare_locales/tests/data/triple-license.dtd
new file mode 100644
index 000000000..4a28b17a6
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/data/triple-license.dtd
@@ -0,0 +1,38 @@
+
+
+
diff --git a/python/compare-locales/compare_locales/tests/test_checks.py b/python/compare-locales/compare_locales/tests/test_checks.py
new file mode 100644
index 000000000..b995d43f9
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_checks.py
@@ -0,0 +1,403 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.checks import getChecker
+from compare_locales.parser import getParser, Entity
+from compare_locales.paths import File
+
+
+class BaseHelper(unittest.TestCase):
+    file = None
+    refContent = None
+
+    def setUp(self):
+        p = getParser(self.file.file)
+        p.readContents(self.refContent)
+        self.refList, self.refMap = p.parse()
+
+    def _test(self, content, refWarnOrErrors, with_ref_file=False):
+        p = getParser(self.file.file)
+        p.readContents(content)
+        l10n = [e for e in p]
+        assert len(l10n) == 1
+        l10n = l10n[0]
+        if with_ref_file:
+            kwargs = {
+                'reference': self.refList
+            }
+        else:
+            kwargs = {}
+        checker = getChecker(self.file, **kwargs)
+        ref = self.refList[self.refMap[l10n.key]]
+        found = tuple(checker.check(ref, l10n))
+        self.assertEqual(found, refWarnOrErrors)
+
+
+class TestProperties(BaseHelper):
+    file = File('foo.properties', 'foo.properties')
+    refContent = '''some = value
+'''
+
+    def testGood(self):
+        self._test('''some = localized''',
+                   tuple())
+
+    def testMissedEscape(self):
+        self._test(r'''some = \u67ood escape, bad \escape''',
+                   (('warning', 20, r'unknown escape sequence, \e',
+                     'escape'),))
+
+
+class TestPlurals(BaseHelper):
+    file = File('foo.properties', 'foo.properties')
+    refContent = '''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - #2
+'''
+
+    def testGood(self):
+        self._test('''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - #2;#1 filers
+''',
+                   tuple())
+
+    def testNotUsed(self):
+        self._test('''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - Downloads;#1 filers
+''',
+                   (('warning', 0, 'not all variables used in l10n',
+                     'plural'),))
+
+    def testNotDefined(self):
+        self._test('''\
+# LOCALIZATION NOTE (downloadsTitleFiles): Semi-colon list of plural forms.
+# See: http://developer.mozilla.org/en/docs/Localization_and_Plurals
+# #1 number of files
+# example: 111 files - Downloads
+downloadsTitleFiles=#1 file - Downloads;#1 files - #2;#1 #3
+''',
+                   (('error', 0, 'unreplaced variables in l10n', 'plural'),))
+
+
+class TestDTDs(BaseHelper):
+    file = File('foo.dtd', 'foo.dtd')
+    refContent = '''
+
+
+
+
+
+
+'''
+
+    def testWarning(self):
+        self._test('''
+''',
+                   (('warning', (0, 0), 'Referencing unknown entity `not`',
+                     'xmlparse'),))
+        # make sure we only handle translated entity references
+        self._test(u'''
+'''.encode('utf-8'),
+            (('warning', (0, 0), u'Referencing unknown entity `ƞǿŧ`',
+              'xmlparse'),))
+
+    def testErrorFirstLine(self):
+        self._test(''' stuff">
+''',
+                   (('error', (1, 10), 'mismatched tag', 'xmlparse'),))
+
+    def testErrorSecondLine(self):
+        self._test('''
+stuff">
+''',
+                   (('error', (2, 4), 'mismatched tag', 'xmlparse'),))
+
+    def testKeyErrorSingleAmpersand(self):
+        self._test('''
+''',
+                   (('error', (1, 1), 'not well-formed (invalid token)',
+                     'xmlparse'),))
+
+    def testXMLEntity(self):
+        self._test('''
+''',
+                   tuple())
+
+    def testPercentEntity(self):
+        self._test('''
+''',
+                   tuple())
+        self._test('''
+''',
+                   (('error', (0, 32), 'not well-formed (invalid token)',
+                     'xmlparse'),))
+
+    def testNoNumber(self):
+        self._test('''''',
+                   (('warning', 0, 'reference is a number', 'number'),))
+
+    def testNoLength(self):
+        self._test('''''',
+                   (('error', 0, 'reference is a CSS length', 'css'),))
+
+    def testNoStyle(self):
+        self._test('''''',
+                   (('error', 0, 'reference is a CSS spec', 'css'),))
+        self._test('''''',
+                   (('error', 0, 'reference is a CSS spec', 'css'),))
+
+    def testStyleWarnings(self):
+        self._test('''''',
+                   (('warning', 0, 'height only in reference', 'css'),))
+        self._test('''''',
+                   (('warning', 0, "units for width don't match (em != ch)",
+                     'css'),))
+
+    def testNoWarning(self):
+        self._test('''''', tuple())
+        self._test('''''', tuple())
+        self._test('''''', tuple())
+
+
+class TestEntitiesInDTDs(BaseHelper):
+    file = File('foo.dtd', 'foo.dtd')
+    refContent = '''
+
+
+
+'''
+
+    def testOK(self):
+        self._test('''''', tuple(),
+                   with_ref_file=True)
+
+    def testMismatch(self):
+        self._test('''''',
+                   (('warning', (0, 0),
+                     'Entity brandShortName referenced, '
+                     'but brandShorterName used in context',
+                     'xmlparse'),),
+                   with_ref_file=True)
+
+    def testAcross(self):
+        self._test('''''',
+                   tuple(),
+                   with_ref_file=True)
+
+    def testAcrossWithMismatch(self):
+        '''If we could tell that ent.start and ent.end are one string,
+        we should warn. Sadly, we can't, so this goes without warning.'''
+        self._test('''''',
+                   tuple(),
+                   with_ref_file=True)
+
+    def testUnknownWithRef(self):
+        self._test('''''',
+                   (('warning',
+                     (0, 0),
+                     'Referencing unknown entity `foopy` '
+                     '(brandShorterName used in context, '
+                     'brandShortName known)',
+                     'xmlparse'),),
+                   with_ref_file=True)
+
+    def testUnknown(self):
+        self._test('''''',
+                   (('warning',
+                     (0, 0),
+                     'Referencing unknown entity `foopy`'
+                     ' (brandShortName, brandShorterName known)',
+                     'xmlparse'),),
+                   with_ref_file=True)
+
+
+class TestAndroid(unittest.TestCase):
+    """Test Android checker
+
+    Make sure we're hitting our extra rules only if
+    we're passing in a DTD file in the embedding/android module.
+    """
+    apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
+               u"\\u0027, or use \u2019, or put string in quotes."
+    quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
+               u"\\u0022, or put string in apostrophes."
+
+    def getEntity(self, v):
+        return Entity(v, lambda s: s, (0, len(v)), (), (0, 0), (), (),
+                      (0, len(v)), ())
+
+    def getDTDEntity(self, v):
+        v = v.replace('"', '"')
+        return Entity('' % v,
+                      lambda s: s,
+                      (0, len(v) + 16), (), (0, 0), (), (9, 12),
+                      (14, len(v) + 14), ())
+
+    def test_android_dtd(self):
+        """Testing the actual android checks. The logic is involved,
+        so this is a lot of nitty gritty detail tests.
+        """
+        f = File("embedding/android/strings.dtd", "strings.dtd",
+                 "embedding/android")
+        checker = getChecker(f)
+        # good string
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # dtd warning
+        l10n = self.getDTDEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('warning', (0, 0),
+                           'Referencing unknown entity `ref`', 'xmlparse'),))
+        # unescaped apostrophes/quotes are errors unless the string is quoted
+        for i in xrange(3):
+            # make sure we're catching unescaped apostrophes,
+            # try 0..5 backslashes
+            l10n = self.getDTDEntity("\\"*(2*i) + "'")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             (('error', 2*i, self.apos_msg, 'android'),))
+            l10n = self.getDTDEntity("\\"*(2*i + 1) + "'")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             ())
+            # make sure we don't report if apos string is quoted
+            l10n = self.getDTDEntity('"' + "\\"*(2*i) + "'\"")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s"
+                             % (l10n.val, str(tpl)))
+            l10n = self.getDTDEntity('"' + "\\"*(2*i+1) + "'\"")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s"
+                             % (l10n.val, str(tpl)))
+            # make sure we're catching unescaped quotes, try 0..5 backslashes
+            l10n = self.getDTDEntity("\\"*(2*i) + "\"")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             (('error', 2*i, self.quot_msg, 'android'),))
+            l10n = self.getDTDEntity("\\"*(2*i + 1) + "'")
+            self.assertEqual(tuple(checker.check(ref, l10n)),
+                             ())
+            # make sure we don't report if quote string is single quoted
+            l10n = self.getDTDEntity("'" + "\\"*(2*i) + "\"'")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s" %
+                             (l10n.val, str(tpl)))
+            l10n = self.getDTDEntity('"' + "\\"*(2*i+1) + "'\"")
+            tpl = tuple(checker.check(ref, l10n))
+            self.assertEqual(tpl, (),
+                             "`%s` shouldn't fail but got %s" %
+                             (l10n.val, str(tpl)))
+        # check for mixed quotes and ampersands
+        l10n = self.getDTDEntity("'\"")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 0, self.apos_msg, 'android'),
+                          ('error', 1, self.quot_msg, 'android')))
+        l10n = self.getDTDEntity("''\"'")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 1, self.apos_msg, 'android'),))
+        l10n = self.getDTDEntity('"\'""')
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 2, self.quot_msg, 'android'),))
+
+        # broken unicode escape
+        l10n = self.getDTDEntity("Some broken \u098 unicode")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 12, 'truncated \\uXXXX escape',
+                           'android'),))
+        # broken unicode escape, try to set the error off
+        l10n = self.getDTDEntity(u"\u9690"*14+"\u006"+"  "+"\u0064")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 14, 'truncated \\uXXXX escape',
+                           'android'),))
+
+    def test_android_prop(self):
+        f = File("embedding/android/strings.properties", "strings.properties",
+                 "embedding/android")
+        checker = getChecker(f)
+        # good plain string
+        ref = self.getEntity("plain string")
+        l10n = self.getEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # no dtd warning
+        ref = self.getEntity("plain string")
+        l10n = self.getEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # no report on stray ampersand
+        ref = self.getEntity("plain string")
+        l10n = self.getEntity("plain localized string with apos: '")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # report on bad printf
+        ref = self.getEntity("string with %s")
+        l10n = self.getEntity("string with %S")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('error', 0, 'argument 1 `S` should be `s`',
+                           'printf'),))
+
+    def test_non_android_dtd(self):
+        f = File("browser/strings.dtd", "strings.dtd", "browser")
+        checker = getChecker(f)
+        # good string
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # dtd warning
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('warning', (0, 0),
+                          'Referencing unknown entity `ref`', 'xmlparse'),))
+        # no report on stray ampersand
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string with apos: '")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+
+    def test_entities_across_dtd(self):
+        f = File("browser/strings.dtd", "strings.dtd", "browser")
+        p = getParser(f.file)
+        p.readContents('')
+        ref = p.parse()
+        checker = getChecker(f, reference=ref[0])
+        # good string
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+        # dtd warning
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string &ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         (('warning', (0, 0),
+                           'Referencing unknown entity `ref` (good.ref known)',
+                           'xmlparse'),))
+        # no report on stray ampersand
+        ref = self.getDTDEntity("plain string")
+        l10n = self.getDTDEntity("plain localized string with &good.ref;")
+        self.assertEqual(tuple(checker.check(ref, l10n)),
+                         ())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_compare.py b/python/compare-locales/compare_locales/tests/test_compare.py
new file mode 100644
index 000000000..51ba7cd8c
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_compare.py
@@ -0,0 +1,90 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import compare
+
+
+class TestTree(unittest.TestCase):
+    '''Test the Tree utility class
+
+    Tree value classes need to be in-place editable
+    '''
+
+    def test_empty_dict(self):
+        tree = compare.Tree(dict)
+        self.assertEqual(list(tree.getContent()), [])
+        self.assertDictEqual(
+            tree.toJSON(),
+            {}
+        )
+
+    def test_disjoint_dict(self):
+        tree = compare.Tree(dict)
+        tree['one/entry']['leaf'] = 1
+        tree['two/other']['leaf'] = 2
+        self.assertEqual(
+            list(tree.getContent()),
+            [
+                (0, 'key', ('one', 'entry')),
+                (1, 'value', {'leaf': 1}),
+                (0, 'key', ('two', 'other')),
+                (1, 'value', {'leaf': 2})
+            ]
+        )
+        self.assertDictEqual(
+            tree.toJSON(),
+            {
+                'children': [
+                    ('one/entry',
+                     {'value': {'leaf': 1}}
+                     ),
+                    ('two/other',
+                     {'value': {'leaf': 2}}
+                     )
+                ]
+            }
+        )
+        self.assertMultiLineEqual(
+            str(tree),
+            '''\
+one/entry
+    {'leaf': 1}
+two/other
+    {'leaf': 2}\
+'''
+        )
+
+    def test_overlapping_dict(self):
+        tree = compare.Tree(dict)
+        tree['one/entry']['leaf'] = 1
+        tree['one/other']['leaf'] = 2
+        self.assertEqual(
+            list(tree.getContent()),
+            [
+                (0, 'key', ('one',)),
+                (1, 'key', ('entry',)),
+                (2, 'value', {'leaf': 1}),
+                (1, 'key', ('other',)),
+                (2, 'value', {'leaf': 2})
+            ]
+        )
+        self.assertDictEqual(
+            tree.toJSON(),
+            {
+                'children': [
+                    ('one', {
+                        'children': [
+                            ('entry',
+                             {'value': {'leaf': 1}}
+                             ),
+                            ('other',
+                             {'value': {'leaf': 2}}
+                             )
+                        ]
+                    })
+                ]
+            }
+        )
diff --git a/python/compare-locales/compare_locales/tests/test_dtd.py b/python/compare-locales/compare_locales/tests/test_dtd.py
new file mode 100644
index 000000000..87ddcde30
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_dtd.py
@@ -0,0 +1,86 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''Tests for the DTD parser.
+'''
+
+import unittest
+import re
+
+from compare_locales.parser import getParser
+from compare_locales.tests import ParserTestMixin
+
+
+class TestDTD(ParserTestMixin, unittest.TestCase):
+    '''Tests for the DTD Parser.'''
+    filename = 'foo.dtd'
+
+    def test_one_entity(self):
+        self._test('''''',
+                   (('foo.label', 'stuff'),))
+
+    quoteContent = '''
+
+
+
+
+
+
+'''
+    quoteRef = (
+        ('good.one', 'one'),
+        ('_junk_\\d_25-56$', ''),
+        ('good.two', 'two'),
+        ('_junk_\\d_82-119$', ''),
+        ('good.three', 'three'),
+        ('good.four', 'good \' quote'),
+        ('good.five', 'good \'quoted\' word'),)
+
+    def test_quotes(self):
+        self._test(self.quoteContent, self.quoteRef)
+
+    def test_apos(self):
+        qr = re.compile('[\'"]', re.M)
+
+        def quot2apos(s):
+            return qr.sub(lambda m: m.group(0) == '"' and "'" or '"', s)
+
+        self._test(quot2apos(self.quoteContent),
+                   map(lambda t: (t[0], quot2apos(t[1])), self.quoteRef))
+
+    def test_parsed_ref(self):
+        self._test('''
+  %fooDTD;
+''',
+                   (('fooDTD', '"chrome://brand.dtd"'),))
+
+    def test_trailing_comment(self):
+        self._test('''
+
+
+''',
+                   (('first', 'string'), ('second', 'string')))
+
+    def test_license_header(self):
+        p = getParser('foo.dtd')
+        p.readContents(self.resource('triple-license.dtd'))
+        for e in p:
+            self.assertEqual(e.key, 'foo')
+            self.assertEqual(e.val, 'value')
+        self.assert_('MPL' in p.header)
+        p.readContents('''\
+
+
+''')
+        for e in p:
+            self.assertEqual(e.key, 'foo')
+            self.assertEqual(e.val, 'value')
+        self.assert_('MPL' in p.header)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_ini.py b/python/compare-locales/compare_locales/tests/test_ini.py
new file mode 100644
index 000000000..4c8cc03e1
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_ini.py
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.tests import ParserTestMixin
+
+
+mpl2 = '''\
+; This Source Code Form is subject to the terms of the Mozilla Public
+; License, v. 2.0. If a copy of the MPL was not distributed with this file,
+; You can obtain one at http://mozilla.org/MPL/2.0/.
+'''
+
+
+class TestIniParser(ParserTestMixin, unittest.TestCase):
+
+    filename = 'foo.ini'
+
+    def testSimpleHeader(self):
+        self._test('''; This file is in the UTF-8 encoding
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('UTF-8' in self.parser.header)
+
+    def testMPL2_Space_UTF(self):
+        self._test(mpl2 + '''
+; This file is in the UTF-8 encoding
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def testMPL2_Space(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def testMPL2_MultiSpace(self):
+        self._test(mpl2 + '''\
+
+; more comments
+
+[Strings]
+TitleText=Some Title
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def testMPL2_JunkBeforeCategory(self):
+        self._test(mpl2 + '''\
+Junk
+[Strings]
+TitleText=Some Title
+''', (('_junk_\\d+_0-213$', mpl2 + '''\
+Junk
+[Strings]'''), ('TitleText', 'Some Title')))
+        self.assert_('MPL' not in self.parser.header)
+
+    def test_TrailingComment(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+;Stray trailing comment
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def test_SpacedTrailingComments(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+
+;Stray trailing comment
+;Second stray comment
+
+''', (('TitleText', 'Some Title'),))
+        self.assert_('MPL' in self.parser.header)
+
+    def test_TrailingCommentsAndJunk(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+
+;Stray trailing comment
+Junk
+;Second stray comment
+
+''', (('TitleText', 'Some Title'), ('_junk_\\d+_231-284$', '''\
+
+;Stray trailing comment
+Junk
+;Second stray comment
+
+''')))
+        self.assert_('MPL' in self.parser.header)
+
+    def test_JunkInbetweenEntries(self):
+        self._test(mpl2 + '''
+[Strings]
+TitleText=Some Title
+
+Junk
+
+Good=other string
+''', (('TitleText', 'Some Title'), ('_junk_\\d+_231-236$', '''\
+
+Junk'''), ('Good', 'other string')))
+        self.assert_('MPL' in self.parser.header)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_merge.py b/python/compare-locales/compare_locales/tests/test_merge.py
new file mode 100644
index 000000000..c006edbb5
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_merge.py
@@ -0,0 +1,265 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+import os
+from tempfile import mkdtemp
+import shutil
+
+from compare_locales.parser import getParser
+from compare_locales.paths import File
+from compare_locales.compare import ContentComparer
+
+
class ContentMixin(object):
    '''Mixin writing reference and localized fixture files into
    self.tmp for the merge tests.

    Subclasses must set `extension` and create `self.tmp` (a
    directory) in their setUp.
    '''
    maxDiff = None  # we got big dictionaries to compare
    extension = None  # OVERLOAD, e.g. '.properties' or '.dtd'

    def reference(self, content):
        '''Write `content` to the en-US reference file; its path is
        stored in self.ref.'''
        self.ref = os.path.join(self.tmp, "en-reference" + self.extension)
        # context manager closes the handle deterministically; the
        # original open(...).write(...) relied on refcounting
        with open(self.ref, "w") as f:
            f.write(content)

    def localized(self, content):
        '''Write `content` to the localized file; its path is stored
        in self.l10n.'''
        self.l10n = os.path.join(self.tmp, "l10n" + self.extension)
        with open(self.l10n, "w") as f:
            f.write(content)
+
+
class TestProperties(unittest.TestCase, ContentMixin):
    '''Merge/compare tests for .properties files.'''
    extension = '.properties'

    def setUp(self):
        # fresh working dir with a merge/ subdir per test
        self.tmp = mkdtemp()
        os.mkdir(os.path.join(self.tmp, "merge"))

    def tearDown(self):
        shutil.rmtree(self.tmp)
        del self.tmp

    def testGood(self):
        '''All entities translated: 3 changed, no merge output.'''
        self.assertTrue(os.path.isdir(self.tmp))
        self.reference("""foo = fooVal
bar = barVal
eff = effVal""")
        self.localized("""foo = lFoo
bar = lBar
eff = lEff
""")
        cc = ContentComparer()
        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
        cc.compare(File(self.ref, "en-reference.properties", ""),
                   File(self.l10n, "l10n.properties", ""))
        self.assertDictEqual(
            cc.observer.toJSON(),
            {'summary':
                {None: {
                    'changed': 3
                }},
             'details': {}
             }
        )
        # assertTrue replaces the deprecated assert_ alias
        self.assertTrue(not os.path.exists(os.path.join(cc.merge_stage,
                                                        'l10n.properties')))

    def testMissing(self):
        '''Missing entities are reported and a merge file is written
        containing the reference entries.'''
        self.assertTrue(os.path.isdir(self.tmp))
        self.reference("""foo = fooVal
bar = barVal
eff = effVal""")
        self.localized("""bar = lBar
""")
        cc = ContentComparer()
        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
        cc.compare(File(self.ref, "en-reference.properties", ""),
                   File(self.l10n, "l10n.properties", ""))
        self.assertDictEqual(
            cc.observer.toJSON(),
            {'summary':
                {None: {
                    'changed': 1, 'missing': 2
                }},
             'details': {
                 'children': [
                     ('l10n.properties',
                         {'value': {'missingEntity': [u'eff', u'foo']}}
                      )
                 ]}
             }
        )
        mergefile = os.path.join(self.tmp, "merge", "l10n.properties")
        self.assertTrue(os.path.isfile(mergefile))
        p = getParser(mergefile)
        p.readFile(mergefile)
        [m, n] = p.parse()
        # list comprehension, consistent with testError below
        self.assertEqual([e.key for e in m], ["bar", "eff", "foo"])

    def testError(self):
        '''A printf-argument mismatch is flagged as an error and the
        merged file falls back to the reference value.'''
        self.assertTrue(os.path.isdir(self.tmp))
        self.reference("""foo = fooVal
bar = %d barVal
eff = effVal""")
        self.localized("""bar = %S lBar
eff = leffVal
""")
        cc = ContentComparer()
        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
        cc.compare(File(self.ref, "en-reference.properties", ""),
                   File(self.l10n, "l10n.properties", ""))
        self.assertDictEqual(
            cc.observer.toJSON(),
            {'summary':
                {None: {
                    'changed': 2, 'errors': 1, 'missing': 1
                }},
             'details': {
                 'children': [
                     ('l10n.properties',
                         {'value': {
                          'error': [u'argument 1 `S` should be `d` '
                                    u'at line 1, column 6 for bar'],
                          'missingEntity': [u'foo']}}
                      )
                 ]}
             }
        )
        mergefile = os.path.join(self.tmp, "merge", "l10n.properties")
        self.assertTrue(os.path.isfile(mergefile))
        p = getParser(mergefile)
        p.readFile(mergefile)
        [m, n] = p.parse()
        self.assertEqual([e.key for e in m], ["eff", "foo", "bar"])
        self.assertEqual(m[n['bar']].val, '%d barVal')

    def testObsolete(self):
        '''Entities only present in the localization are reported as
        obsolete.'''
        self.assertTrue(os.path.isdir(self.tmp))
        self.reference("""foo = fooVal
eff = effVal""")
        self.localized("""foo = fooVal
other = obsolete
eff = leffVal
""")
        cc = ContentComparer()
        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
        cc.compare(File(self.ref, "en-reference.properties", ""),
                   File(self.l10n, "l10n.properties", ""))
        self.assertDictEqual(
            cc.observer.toJSON(),
            {'summary':
                {None: {
                    'changed': 1, 'obsolete': 1, 'unchanged': 1
                }},
             'details': {
                 'children': [
                     ('l10n.properties',
                         {'value': {'obsoleteEntity': [u'other']}})]},
             }
        )
+
+
class TestDTD(unittest.TestCase, ContentMixin):
    '''Merge/compare tests for DTD files.

    NOTE(review): the triple-quoted fixtures below contain only blank
    lines, while the expected results reference entities named foo,
    bar and eff (and testJunk expects unparsed content at 23-44).
    The <!ENTITY ...> declarations appear to have been stripped from
    this copy of the file; verify the fixture strings against the
    original repository before relying on these tests.
    '''
    extension = '.dtd'

    def setUp(self):
        # fresh working dir with a merge/ subdir per test
        self.tmp = mkdtemp()
        os.mkdir(os.path.join(self.tmp, "merge"))

    def tearDown(self):
        shutil.rmtree(self.tmp)
        del self.tmp

    def testGood(self):
        # all entities translated: only 'changed' is counted and no
        # merge file may be written
        self.assertTrue(os.path.isdir(self.tmp))
        self.reference("""

""")
        self.localized("""


""")
        cc = ContentComparer()
        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
        cc.compare(File(self.ref, "en-reference.dtd", ""),
                   File(self.l10n, "l10n.dtd", ""))
        self.assertDictEqual(
            cc.observer.toJSON(),
            {'summary':
                {None: {
                    'changed': 3
                }},
             'details': {}
             }
        )
        self.assert_(
            not os.path.exists(os.path.join(cc.merge_stage, 'l10n.dtd')))

    def testMissing(self):
        # missing entities are reported and merged in from the reference
        self.assertTrue(os.path.isdir(self.tmp))
        self.reference("""

""")
        self.localized("""
""")
        cc = ContentComparer()
        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
        cc.compare(File(self.ref, "en-reference.dtd", ""),
                   File(self.l10n, "l10n.dtd", ""))
        self.assertDictEqual(
            cc.observer.toJSON(),
            {'summary':
                {None: {
                    'changed': 1, 'missing': 2
                }},
             'details': {
                 'children': [
                     ('l10n.dtd',
                         {'value': {'missingEntity': [u'eff', u'foo']}}
                      )
                 ]}
             }
        )
        mergefile = os.path.join(self.tmp, "merge", "l10n.dtd")
        self.assertTrue(os.path.isfile(mergefile))
        p = getParser(mergefile)
        p.readFile(mergefile)
        [m, n] = p.parse()
        self.assertEqual(map(lambda e: e.key,  m), ["bar", "eff", "foo"])

    def testJunk(self):
        # unparsable content is reported as an error; the affected
        # entity counts as missing and gets merged from the reference
        self.assertTrue(os.path.isdir(self.tmp))
        self.reference("""

""")
        self.localized("""


""")
        cc = ContentComparer()
        cc.set_merge_stage(os.path.join(self.tmp, "merge"))
        cc.compare(File(self.ref, "en-reference.dtd", ""),
                   File(self.l10n, "l10n.dtd", ""))
        self.assertDictEqual(
            cc.observer.toJSON(),
            {'summary':
                {None: {
                    'errors': 1, 'missing': 1, 'unchanged': 2
                }},
             'details': {
                 'children': [
                     ('l10n.dtd',
                         {'value': {
                             'error': [u'Unparsed content "" at 23-44'],
                             'missingEntity': [u'bar']}}
                      )
                 ]}
             }
        )
        mergefile = os.path.join(self.tmp, "merge", "l10n.dtd")
        self.assertTrue(os.path.isfile(mergefile))
        p = getParser(mergefile)
        p.readFile(mergefile)
        [m, n] = p.parse()
        self.assertEqual(map(lambda e: e.key,  m), ["foo", "eff", "bar"])
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_properties.py b/python/compare-locales/compare_locales/tests/test_properties.py
new file mode 100644
index 000000000..331a1a57c
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_properties.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales.tests import ParserTestMixin
+
+
class TestPropertiesParser(ParserTestMixin, unittest.TestCase):
    '''Parsing tests for the .properties parser.'''

    filename = 'foo.properties'

    def testBackslashes(self):
        '''Line continuations, escaped backslashes, and the junk that
        follows a completed value.'''
        self._test(r'''one_line = This is one line
two_line = This is the first \
of two lines
one_line_trailing = This line ends in \\
and has junk
two_lines_triple = This line is one of two and ends in \\\
and still has another line coming
''', (
            ('one_line', 'This is one line'),
            ('two_line', u'This is the first of two lines'),
            ('one_line_trailing', u'This line ends in \\'),
            ('_junk_\\d+_113-126$', 'and has junk\n'),
            ('two_lines_triple', 'This line is one of two and ends in \\'
             'and still has another line coming')))

    def testProperties(self):
        # port of netwerk/test/PropertiesTest.cpp
        self.parser.readContents(self.resource('test.properties'))
        ref = ['1', '2', '3', '4', '5', '6', '7', '8',
               'this is the first part of a continued line '
               'and here is the 2nd part']
        i = iter(self.parser)
        for r, e in zip(ref, i):
            self.assertEqual(e.val, r)

    def test_bug121341(self):
        # port of xpcom/tests/unit/test_bug121341.js
        self.parser.readContents(self.resource('bug121341.properties'))
        ref = ['abc', 'xy', u"\u1234\t\r\n\u00AB\u0001\n",
               "this is multiline property",
               "this is another multiline property", u"test\u0036",
               "yet another multiline propery", u"\ttest5\u0020", " test6\t",
               u"c\uCDEFd", u"\uABCD"]
        i = iter(self.parser)
        for r, e in zip(ref, i):
            self.assertEqual(e.val, r)

    def test_comment_in_multi(self):
        '''A continuation line that looks like a comment is still part
        of the value.'''
        self._test(r'''bar=one line with a \
# part that looks like a comment \
and an end''', (('bar', 'one line with a # part that looks like a comment '
                'and an end'),))

    def test_license_header(self):
        '''The MPL2 comment block is recognized as the header.'''
        self._test('''\
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

foo=value
''', (('foo', 'value'),))
        # assertTrue replaces the deprecated assert_ alias
        self.assertTrue('MPL' in self.parser.header)

    def test_escapes(self):
        '''Unicode escapes of various (including invalid) lengths.'''
        self.parser.readContents(r'''
# unicode escapes
zero = some \unicode
one = \u0
two = \u41
three = \u042
four = \u0043
five = \u0044a
six = \a
seven = \n\r\t\\
''')
        ref = ['some unicode', chr(0), 'A', 'B', 'C', 'Da', 'a', '\n\r\t\\']
        for r, e in zip(ref, self.parser):
            self.assertEqual(e.val, r)

    def test_trailing_comment(self):
        '''Comments after the last entry are not entities.'''
        self._test('''first = string
second = string

#
#commented out
''', (('first', 'string'), ('second', 'string')))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/compare-locales/compare_locales/tests/test_util.py b/python/compare-locales/compare_locales/tests/test_util.py
new file mode 100644
index 000000000..fd2d2c92b
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_util.py
@@ -0,0 +1,29 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import util
+
+
class ParseLocalesTest(unittest.TestCase):
    '''Tests for util.parseLocales.

    Uses assertEqual throughout; assertEquals is a deprecated alias.
    '''

    def test_empty(self):
        self.assertEqual(util.parseLocales(''), [])

    def test_all(self):
        self.assertEqual(util.parseLocales('''af
de'''), ['af', 'de'])

    def test_shipped(self):
        '''Platform annotations after the locale code are dropped.'''
        self.assertEqual(util.parseLocales('''af
ja win mac
de'''), ['af', 'de', 'ja'])

    def test_sparse(self):
        '''Blank lines are skipped.'''
        self.assertEqual(util.parseLocales('''
af

de

'''), ['af', 'de'])
diff --git a/python/compare-locales/compare_locales/tests/test_webapps.py b/python/compare-locales/compare_locales/tests/test_webapps.py
new file mode 100644
index 000000000..2f1223649
--- /dev/null
+++ b/python/compare-locales/compare_locales/tests/test_webapps.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import unittest
+
+from compare_locales import webapps
+
+
class TestFileComparison(unittest.TestCase):
    '''Tests for webapps.FileComparison, with the directory listing
    faked so no filesystem access happens.'''

    def mock_FileComparison(self, mock_listdir):
        # Subclass so _listdir delegates to the supplied callback.
        class Target(webapps.FileComparison):
            def _listdir(self):
                return mock_listdir()
        return Target('.', 'en-US')

    def test_just_reference(self):
        '''Only the en-US reference exists: no locales are found.'''
        filecomp = self.mock_FileComparison(
            lambda: ['my_app.en-US.properties'])
        filecomp.files()
        self.assertEqual(filecomp.locales(), [])
        self.assertEqual(filecomp._reference.keys(), ['my_app'])
        file_ = filecomp._reference['my_app']
        self.assertEqual(file_.file, 'locales/my_app.en-US.properties')

    def test_just_locales(self):
        '''Locale files are picked up; names with underscores (po_SI)
        do not match the pattern and are ignored.'''
        listing = ['my_app.ar.properties',
                   'my_app.sr-Latn.properties',
                   'my_app.sv-SE.properties',
                   'my_app.po_SI.properties']
        filecomp = self.mock_FileComparison(lambda: listing)
        filecomp.files()
        self.assertEqual(filecomp.locales(),
                         ['ar', 'sr-Latn', 'sv-SE'])
        self.assertEqual(filecomp._files['ar'].keys(), ['my_app'])
        file_ = filecomp._files['ar']['my_app']
        self.assertEqual(file_.file, 'locales/my_app.ar.properties')
diff --git a/python/compare-locales/compare_locales/util.py b/python/compare-locales/compare_locales/util.py
new file mode 100644
index 000000000..71eadd874
--- /dev/null
+++ b/python/compare-locales/compare_locales/util.py
@@ -0,0 +1,11 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This file is shared between compare-locales and locale-inspector
+# test_util is in compare-locales only, for the sake of easy
+# development.
+
+
def parseLocales(content):
    '''Parse an all-locales/shipped-locales style listing.

    Each non-blank line starts with a locale code, optionally followed
    by platform annotations (e.g. "ja win mac"), which are ignored.
    Returns the sorted list of locale codes.

    Lines consisting only of whitespace are skipped; the previous
    `if l` test let them through and `l.split()[0]` then raised
    IndexError.
    '''
    return sorted(line.split()[0]
                  for line in content.splitlines()
                  if line.strip())
diff --git a/python/compare-locales/compare_locales/webapps.py b/python/compare-locales/compare_locales/webapps.py
new file mode 100644
index 000000000..42f5b5657
--- /dev/null
+++ b/python/compare-locales/compare_locales/webapps.py
@@ -0,0 +1,235 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'''gaia-style web apps support
+
+This variant supports manifest.webapp localization as well as
+.properties files with a naming scheme of locales/foo.*.properties.
+'''
+
+from collections import defaultdict
+import json
+import os
+import os.path
+import re
+
+from compare_locales.paths import File, EnumerateDir
+from compare_locales.compare import AddRemove, ContentComparer
+
+
class WebAppCompare(object):
    '''For a given directory, analyze
    /manifest.webapp
    /locales/*.*.properties

    Deduce the present locale codes.
    '''
    ignore_dirs = EnumerateDir.ignore_dirs
    reference_locale = 'en-US'

    def __init__(self, basedir):
        '''Constructor
        :param basedir: Directory of the web app to inspect
        '''
        self.basedir = basedir
        self.manifest = Manifest(basedir, self.reference_locale)
        self.files = FileComparison(basedir, self.reference_locale)
        self.watcher = None

    def compare(self, locales):
        '''Compare the manifest.webapp and the locales/*.*.properties
        '''
        # an empty locale list means "whatever is on disk"
        locales = locales or self.locales()
        self.manifest.compare(locales)
        self.files.compare(locales)

    def setWatcher(self, watcher):
        '''Attach the watcher to ourselves and both sub-comparisons.'''
        self.watcher = watcher
        self.manifest.watcher = watcher
        self.files.watcher = watcher

    def locales(self):
        '''Inspect files on disk to find present languages.
        :rtype: List of locales, sorted, including reference.
        '''
        found = set(self.manifest.strings.keys())
        found.update(self.files.locales())
        return sorted(found)
+
+
class Manifest(object):
    '''Class that helps with parsing and inspection of manifest.webapp.
    '''

    def __init__(self, basedir, reference_locale):
        self.file = File(os.path.join(basedir, 'manifest.webapp'),
                         'manifest.webapp')
        self.reference_locale = reference_locale
        self._strings = None  # lazily filled by the `strings` property
        self.watcher = None

    @property
    def strings(self):
        '''Localizable strings keyed by locale, loaded on first access.'''
        if self._strings is None:
            self._strings = self.load_and_parse()
        return self._strings

    def load_and_parse(self):
        '''Load manifest.webapp and extract its localizable strings.

        Returns False (after notifying the watcher, if any) when the
        file is unreadable or not valid JSON.
        '''
        try:
            # `with` closes the handle even if json.load raises
            with open(self.file.fullpath) as f:
                manifest = json.load(f)
        except (ValueError, IOError) as e:
            if self.watcher:
                self.watcher.notify('error', self.file, str(e))
            return False
        return self.extract_manifest_strings(manifest)

    def extract_manifest_strings(self, manifest_fragment):
        '''Extract localizable strings from a manifest dict.
        This method is recursive, and returns a two-level dict,
        first level being locale codes, second level being generated
        key and localized value. Keys are generated by concatenating
        each level in the json with a ".".

        Note: pops 'locales' out of the passed-in dict.
        '''
        rv = defaultdict(dict)
        localizable = manifest_fragment.pop('locales', {})
        if localizable:
            for locale, keyvalue in localizable.items():
                for key, value in keyvalue.items():
                    key = '.'.join(['locales', 'AB_CD', key])
                    rv[locale][key] = value
        for key, sub_manifest in manifest_fragment.items():
            if not isinstance(sub_manifest, dict):
                continue
            subdict = self.extract_manifest_strings(sub_manifest)
            if subdict:
                # BUG FIX: iterate items(); iterating the dict itself
                # yields only the locale-code keys, so unpacking into
                # (locale, keyvalue) was wrong.
                for locale, keyvalue in subdict.items():
                    rv[locale].update((key + '.' + subkey, value)
                                      for subkey, value
                                      in keyvalue.items())
        return rv

    def compare(self, locales):
        '''Compare each non-reference locale against the reference.'''
        strings = self.strings
        if not strings:
            return
        # create a copy so that we can mock around with it
        strings = strings.copy()
        reference = strings.pop(self.reference_locale)
        for locale in locales:
            if locale == self.reference_locale:
                continue
            self.compare_strings(reference,
                                 strings.get(locale, {}),
                                 locale)

    def compare_strings(self, reference, l10n, locale):
        '''Report missing and obsolete manifest strings to the watcher.'''
        add_remove = AddRemove()
        add_remove.set_left(sorted(reference.keys()))
        add_remove.set_right(sorted(l10n.keys()))
        missing = obsolete = changed = unchanged = 0
        for op, item_or_pair in add_remove:
            if op == 'equal':
                if reference[item_or_pair[0]] == l10n[item_or_pair[1]]:
                    unchanged += 1
                else:
                    changed += 1
            else:
                # keys carry the 'AB_CD' placeholder; report them with
                # the concrete locale code instead
                key = item_or_pair.replace('.AB_CD.',
                                           '.%s.' % locale)
                if op == 'add':
                    # obsolete entry
                    obsolete += 1
                    self.watcher.notify('obsoleteEntity', self.file, key)
                else:
                    # missing entry
                    missing += 1
                    self.watcher.notify('missingEntity', self.file, key)
+
+
class FileComparison(object):
    '''Compare the locales/*.*.properties files inside a webapp.
    '''
    # file name pattern: <base>.<locale>.properties, the locale being
    # alpha segments separated by dashes (e.g. 'sr-Latn').
    # BUG FIX: the named groups were missing their names, so
    # re.compile failed and match.group('locale')/group('base') below
    # could never work.
    prop = re.compile('(?P<base>.*)\\.'
                      '(?P<locale>[a-zA-Z]+(?:-[a-zA-Z]+)*)'
                      '\\.properties$')

    def __init__(self, basedir, reference_locale):
        self.basedir = basedir
        self.reference_locale = reference_locale
        self.watcher = None
        self._reference = self._files = None

    def locales(self):
        '''Get the locales present in the webapp
        '''
        self.files()
        return sorted(self._files)

    def compare(self, locales):
        '''Compare the reference files against each given locale,
        notifying the watcher about matching, obsolete, and missing
        files.'''
        self.files()
        for locale in locales:
            l10n = self._files[locale]
            filecmp = AddRemove()
            filecmp.set_left(sorted(self._reference.keys()))
            filecmp.set_right(sorted(l10n.keys()))
            for op, item_or_pair in filecmp:
                if op == 'equal':
                    self.watcher.compare(self._reference[item_or_pair[0]],
                                         l10n[item_or_pair[1]])
                elif op == 'add':
                    # obsolete file
                    self.watcher.remove(l10n[item_or_pair])
                else:
                    # missing file
                    _path = '.'.join([item_or_pair, locale, 'properties'])
                    missingFile = File(
                        os.path.join(self.basedir, 'locales', _path),
                        'locales/' + _path)
                    self.watcher.add(self._reference[item_or_pair],
                                     missingFile)

    def files(self):
        '''Read the list of locales from disk.

        Fills self._reference (reference locale) and self._files
        (everything else); no-op if already loaded.
        '''
        if self._reference:
            return
        self._reference = {}
        self._files = defaultdict(dict)
        for path in self._listdir():
            match = self.prop.match(path)
            if match is None:
                continue
            locale = match.group('locale')
            if locale == self.reference_locale:
                target = self._reference
            else:
                target = self._files[locale]
            fullpath = os.path.join(self.basedir, 'locales', path)
            target[match.group('base')] = File(fullpath, 'locales/' + path)

    def _listdir(self):
        'Monkey-patch this for testing.'
        return os.listdir(os.path.join(self.basedir, 'locales'))
+
+
def compare_web_app(basedir, locales, other_observer=None):
    '''Compare gaia-style web app.

    Optional arguments are:
    - other_observer. A object implementing
        notify(category, _file, data)
      The return values of that callback are ignored.
    '''
    content_comparer = ContentComparer()
    if other_observer is not None:
        content_comparer.add_observer(other_observer)
    app = WebAppCompare(basedir)
    app.setWatcher(content_comparer)
    app.compare(locales)
    return content_comparer.observer
-- 
cgit v1.2.3