summaryrefslogtreecommitdiffstats
path: root/python/compare-locales/compare_locales/compare.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/compare-locales/compare_locales/compare.py')
-rw-r--r--python/compare-locales/compare_locales/compare.py638
1 files changed, 638 insertions, 0 deletions
diff --git a/python/compare-locales/compare_locales/compare.py b/python/compare-locales/compare_locales/compare.py
new file mode 100644
index 000000000..4f71c46f8
--- /dev/null
+++ b/python/compare-locales/compare_locales/compare.py
@@ -0,0 +1,638 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+'Mozilla l10n compare locales tool'
+
+import codecs
+import os
+import os.path
+import shutil
+import re
+from difflib import SequenceMatcher
+from collections import defaultdict
+
+try:
+ from json import dumps
+except:
+ from simplejson import dumps
+
+from compare_locales import parser
+from compare_locales import paths
+from compare_locales.checks import getChecker
+
+
class Tree(object):
    '''Prefix tree keyed by tuples of '/'-separated path segments.

    Each node maps tuples of path segments to child Tree nodes in
    `branches` and lazily holds one `value`, created by calling
    `valuetype()` the first time the node is looked up.
    Branches that share leading segments are split on demand, so common
    prefixes end up stored only once.
    '''

    def __init__(self, valuetype):
        # valuetype: zero-argument callable used to create node values
        self.branches = dict()
        self.valuetype = valuetype
        self.value = None

    def __getitem__(self, leaf):
        '''Return the value stored for leaf, creating it if needed.

        leaf is either a paths.File, in which case the key is built from
        its non-empty locale and module plus the segments of its file
        path, or a '/'-separated string.
        '''
        if isinstance(leaf, paths.File):
            parts = [p for p in [leaf.locale, leaf.module] if p] + \
                leaf.file.split('/')
        else:
            parts = leaf.split('/')
        return self.__get(parts)

    def __get(self, parts):
        '''Find or create the node for the sequence parts, return its value.
        '''
        common = None
        old = None
        new = tuple(parts)
        t = self
        for k, v in self.branches.items():
            # count the leading segments this branch shares with parts
            shared = 0
            for mine, wanted in zip(k, parts):
                if mine != wanted:
                    break
                shared += 1
            if not shared:
                continue
            common = tuple(k[:shared])
            old = tuple(k[shared:])
            new = tuple(parts[shared:])
            break
        if old:
            # the branch only partially matches, split it at the common
            # prefix and hang the existing subtree below the new node
            self.branches.pop(k)
            t = Tree(self.valuetype)
            t.branches[old] = v
            self.branches[common] = t
        elif common:
            # the complete branch key is a prefix of parts
            t = self.branches[common]
        if new:
            if common:
                # resolve the remaining segments below the matched node
                return t.__get(new)
            # nothing in common with any branch, add a fresh one
            t2 = t
            t = Tree(self.valuetype)
            t2.branches[new] = t
        if t.value is None:
            t.value = t.valuetype()
        return t.value

    indent = ' '

    def getContent(self, depth=0):
        '''
        Returns iterator of (depth, flag, key_or_value) tuples.
        If flag is 'value', key_or_value is a value object, otherwise
        (flag is 'key') it's a key tuple of path segments.
        Keys are visited in sorted order, children follow their key with
        depth increased by one.
        '''
        keys = sorted(self.branches.keys())
        if self.value is not None:
            yield (depth, 'value', self.value)
        for key in keys:
            yield (depth, 'key', key)
            for child in self.branches[key].getContent(depth + 1):
                yield child

    def toJSON(self):
        '''
        Returns this Tree as a JSON-able tree of hashes.
        Only the values need to take care that they're JSON-able.
        The result has an optional 'value' and an optional 'children'
        list of ('/'-joined key, child hash) pairs, sorted by key.
        '''
        json = {}
        keys = sorted(self.branches.keys())
        if self.value is not None:
            json['value'] = self.value
        children = [('/'.join(key), self.branches[key].toJSON())
                    for key in keys]
        if children:
            json['children'] = children
        return json

    def getStrRows(self):
        '''Return a list with one indented text row per key and value.'''
        def tostr(t):
            if t[1] == 'key':
                return self.indent * t[0] + '/'.join(t[2])
            # values are indented one level deeper than their key
            return self.indent * (t[0] + 1) + str(t[2])

        return [tostr(t) for t in self.getContent()]

    def __str__(self):
        return '\n'.join(self.getStrRows())
+
+
class AddRemove(SequenceMatcher):
    '''SequenceMatcher that iterates as a stream of item level actions.

    Set the two sequences with set_left and set_right, then iterate to
    get ('equal', (left, right)), ('delete', left) and ('add', right)
    tuples in opcode order.
    '''

    def __init__(self):
        SequenceMatcher.__init__(self, None, None, None)

    def set_left(self, left):
        '''Set the first sequence, any non-list iterable is copied.'''
        if not isinstance(left, list):
            left = list(left)
        self.set_seq1(left)

    def set_right(self, right):
        '''Set the second sequence, any non-list iterable is copied.'''
        if not isinstance(right, list):
            right = list(right)
        self.set_seq2(right)

    def __iter__(self):
        for tag, i1, i2, j1, j2 in self.get_opcodes():
            if tag == 'equal':
                for pair in zip(self.a[i1:i2], self.b[j1:j2]):
                    yield ('equal', pair)
                continue
            # 'delete', 'insert' and 'replace' only differ in which of
            # the two ranges is empty; deletions are always reported
            # before additions
            for item in self.a[i1:i2]:
                yield ('delete', item)
            for item in self.b[j1:j2]:
                yield ('add', item)
+
+
class DirectoryCompare(SequenceMatcher):
    '''Match the entries of a reference against other enumerations.

    The differences are reported to a watcher object, which needs to
    implement compare(ref, other), add(ref, missing) and remove(obsolete).
    '''

    def __init__(self, reference):
        SequenceMatcher.__init__(self, None, list(reference), [])
        self.watcher = None

    def setWatcher(self, watcher):
        self.watcher = watcher

    def compareWith(self, other):
        '''Compare the reference with other and notify the watcher.

        other needs to be iterable and to implement cloneFile(ref_entry),
        the result of that is passed to watcher.add for entries that only
        exist in the reference.
        Does nothing if no watcher is set.
        '''
        if not self.watcher:
            return
        self.set_seq2(list(other))
        for tag, i1, i2, j1, j2 in self.get_opcodes():
            if tag == 'equal':
                for ref, loc in zip(self.a[i1:i2], self.b[j1:j2]):
                    self.watcher.compare(ref, loc)
                continue
            # for 'delete' the range in b is empty, for 'insert' the
            # range in a is empty, and 'replace' reports both, entries
            # to remove first
            for loc in self.b[j1:j2]:
                self.watcher.remove(loc)
            for ref in self.a[i1:i2]:
                self.watcher.add(ref, other.cloneFile(ref))
+
+
class Observer(object):
    '''Collect the notifications of a comparison run.

    summary maps locale to a mapping of category to count, details is a
    Tree holding one dict of findings per file.
    filter is an optional callable, called as filter(file) for files and
    filter(file, entity) for entities. If it returns "ignore", the
    finding is dropped, any other return value is recorded and handed
    back to the caller of notify.
    '''

    stat_cats = ['missing', 'obsolete', 'missingInFiles', 'report',
                 'changed', 'unchanged', 'keys']

    def __init__(self):
        self.summary = self._new_summary()
        self.details = Tree(dict)
        self.filter = None

    @staticmethod
    def _new_summary():
        '''Create the locale -> category -> count mapping, counts
        default to 0.'''
        return defaultdict(lambda: defaultdict(int))

    @staticmethod
    def _completion(summary):
        '''Return (total, rate) for the stats of a single locale.

        total sums up changed, unchanged, report, missing and
        missingInFiles, rate is the integer percentage of changed
        strings in total, and 0 if there are no strings at all.
        '''
        total = sum([summary[k]
                     for k in ('changed', 'unchanged', 'report', 'missing',
                               'missingInFiles')
                     if k in summary])
        rate = 0
        if total:
            rate = summary.get('changed', 0) * 100 // total
        return total, rate

    # support pickling
    def __getstate__(self):
        # the summary is stored as plain dicts, the filter is not stored
        return dict(summary=self.getSummary(), details=self.details)

    def __setstate__(self, state):
        self.summary = self._new_summary()
        if 'summary' in state:
            for loc, stats in state['summary'].items():
                self.summary[loc].update(stats)
        self.details = state['details']
        self.filter = None

    def getSummary(self):
        '''Return a copy of the summary made of plain dicts.'''
        plaindict = {}
        for k, v in self.summary.items():
            plaindict[k] = dict(v)
        return plaindict

    def toJSON(self):
        return dict(summary=self.getSummary(), details=self.details.toJSON())

    def notify(self, category, file, data):
        '''Record a finding for file.

        Categories in stat_cats add data to the summary of file.locale,
        the other handled categories are 'missingFile', 'obsoleteFile',
        'missingEntity', 'obsoleteEntity', 'error' and 'warning'.
        Returns "error" unless a filter is set and returned something
        else for a missing or obsolete file or entity.
        '''
        rv = "error"
        if category in self.stat_cats:
            # these get called post reporting just for stats
            # return "error" to forward them to other other_observers
            self.summary[file.locale][category] += data
            # keep track of how many strings are in a missing file
            # we got the {'missingFile': 'error'} from the first pass
            if category == 'missingInFiles':
                self.details[file]['strings'] = data
            return "error"
        if category in ['missingFile', 'obsoleteFile']:
            if self.filter is not None:
                rv = self.filter(file)
            if rv != "ignore":
                self.details[file][category] = rv
            return rv
        if category in ['missingEntity', 'obsoleteEntity']:
            if self.filter is not None:
                rv = self.filter(file, data)
            if rv == "ignore":
                return rv
            self.details[file].setdefault(category, []).append(data)
            return rv
        if category == 'error':
            self.details[file].setdefault(category, []).append(data)
            self.summary[file.locale]['errors'] += 1
        elif category == 'warning':
            self.details[file].setdefault(category, []).append(data)
            self.summary[file.locale]['warnings'] += 1
        return rv

    def toExhibit(self):
        '''Return the summary as a JSON string with one 'Build' item
        per locale.'''
        items = []
        for locale in sorted(self.summary.keys()):
            summary = self.summary[locale]
            if locale is not None:
                item = {'id': 'xxx/' + locale,
                        'label': locale,
                        'locale': locale}
            else:
                item = {'id': 'xxx',
                        'label': 'xxx',
                        'locale': 'xxx'}
            item['type'] = 'Build'
            # a locale without any counted strings has a rate of 0
            total, rate = self._completion(summary)
            item.update((k, summary.get(k, 0))
                        for k in ('changed', 'unchanged'))
            item.update((k, summary[k])
                        for k in ('report', 'errors', 'warnings')
                        if k in summary)
            item['missing'] = summary.get('missing', 0) + \
                summary.get('missingInFiles', 0)
            item['completion'] = rate
            item['total'] = total
            result = 'success'
            if item.get('warnings', 0):
                result = 'warning'
            if item.get('errors', 0) or item.get('missing', 0):
                result = 'failure'
            item['result'] = result
            items.append(item)
        data = {
            # one description dict per property, not a shared one
            "properties": dict(
                (name, {"valueType": "number"})
                for name in ("completion", "errors", "warnings", "missing",
                             "report", "unchanged", "changed", "obsolete")),
            "types": {
                "Build": {"pluralLabel": "Builds"}
            }}
        data['items'] = items
        return dumps(data, indent=2)

    def serialize(self, type="text"):
        '''Return the results as a string.

        type "exhibit" gives toExhibit(), "json" the dumped toJSON(),
        anything else a text report of the details followed by the
        stats per locale.
        '''
        if type == "exhibit":
            return self.toExhibit()
        if type == "json":
            return dumps(self.toJSON())

        def tostr(t):
            # t is a (depth, flag, key_or_value) tuple from getContent
            if t[1] == 'key':
                return ' ' * t[0] + '/'.join(t[2])
            o = []
            indent = ' ' * (t[0] + 1)
            if 'error' in t[2]:
                o += [indent + 'ERROR: ' + e for e in t[2]['error']]
            if 'warning' in t[2]:
                o += [indent + 'WARNING: ' + e for e in t[2]['warning']]
            if 'missingEntity' in t[2] or 'obsoleteEntity' in t[2]:
                missingEntities = t[2].get('missingEntity') or []
                obsoleteEntities = t[2].get('obsoleteEntity') or []
                entities = missingEntities + obsoleteEntities
                entities.sort()
                for entity in entities:
                    op = '+'
                    if entity in obsoleteEntities:
                        op = '-'
                    o.append(indent + op + entity)
            elif 'missingFile' in t[2]:
                o.append(indent + '// add and localize this file')
            elif 'obsoleteFile' in t[2]:
                o.append(indent + '// remove this file')
            return '\n'.join(o)

        out = []
        for locale, summary in sorted(self.summary.items()):
            if locale is not None:
                out.append(locale + ':')
            out += [k + ': ' + str(v) for k, v in sorted(summary.items())]
            total, rate = self._completion(summary)
            out.append('%d%% of entries changed' % rate)
        return '\n'.join([tostr(t) for t in self.details.getContent()] + out)

    def __str__(self):
        return 'observer'
+
+
class ContentComparer:
    '''Compare the content of reference files and their localizations.

    Findings are reported through notify(), first to the filtering
    Observer in self.observer and, unless that returns 'ignore', to all
    observers added with add_observer.
    If a merge stage directory is set, files with missing or broken
    entities get a merged copy written below that directory.
    '''
    keyRE = re.compile('[kK]ey')
    nl = re.compile('\n', re.M)

    def __init__(self):
        '''Create a ContentComparer.
        self.observer is an instance of Observer. The return values
        of its notify method are used to control the handling of missing
        entities.
        '''
        # parsed reference files, maps the file to the result of parse()
        self.reference = dict()
        self.observer = Observer()
        self.other_observers = []
        self.merge_stage = None

    def add_observer(self, obs):
        '''Add a non-filtering observer.
        Results from the notify calls are ignored.
        '''
        self.other_observers.append(obs)

    def set_merge_stage(self, merge_stage):
        self.merge_stage = merge_stage

    def merge(self, ref_entities, ref_map, ref_file, l10n_file, missing,
              skips, p):
        '''Write the merged version of l10n_file into the merge stage.

        The output is the localized content without the spans of the
        entities in skips, followed by the reference versions of the
        missing keys and of the skipped entities that aren't Junk.
        If the parser p cannot merge, the reference file is copied.
        skips is sorted in place.
        '''
        outfile = os.path.join(self.merge_stage, l10n_file.module,
                               l10n_file.file)
        outdir = os.path.dirname(outfile)
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        if not p.canMerge:
            shutil.copyfile(ref_file.fullpath, outfile)
            print("copied reference to " + outfile)
            return
        if skips:
            # skips come in ordered by key name, we need them in file order
            skips.sort(key=lambda s: s.span[0])
        trailing = (['\n'] +
                    [ref_entities[ref_map[key]].all for key in missing] +
                    [ref_entities[ref_map[skip.key]].all for skip in skips
                     if not isinstance(skip, parser.Junk)])

        def ensureNewline(s):
            if not s.endswith('\n'):
                return s + '\n'
            return s

        if skips:
            f = codecs.open(outfile, 'wb', p.encoding)
        else:
            shutil.copyfile(l10n_file.fullpath, outfile)
            f = codecs.open(outfile, 'ab', p.encoding)
        # make sure to close the file, even if writing fails
        try:
            if skips:
                # we need to skip a few erroneous blocks in the input,
                # copy by hand
                offset = 0
                for skip in skips:
                    chunk = skip.span
                    f.write(p.contents[offset:chunk[0]])
                    offset = chunk[1]
                f.write(p.contents[offset:])
            print("adding to " + outfile)
            f.write(''.join([ensureNewline(s) for s in trailing]))
        finally:
            f.close()

    def notify(self, category, file, data):
        """Check observer for the found data, and if it's
        not to ignore, notify other_observers.
        """
        rv = self.observer.notify(category, file, data)
        if rv == 'ignore':
            return rv
        for obs in self.other_observers:
            # non-filtering other_observers, ignore results
            obs.notify(category, file, data)
        return rv

    def remove(self, obsolete):
        '''Report a file that only exists in the localization.'''
        self.notify('obsoleteFile', obsolete, None)

    def compare(self, ref_file, l10n):
        '''Compare the entities in ref_file with those in l10n.

        Reports missing and obsolete entities, unparsed content and the
        results of the checker for the file, if there is one, followed
        by the stats for the file. Triggers a merge if a merge stage is
        set and there are entities to add or to skip.
        Parsed references are cached in self.reference.
        '''
        try:
            p = parser.getParser(ref_file.file)
        except UserWarning:
            # no comparison, XXX report?
            return
        if ref_file not in self.reference:
            # we didn't parse this before
            try:
                p.readContents(ref_file.getContents())
            except Exception as e:
                self.notify('error', ref_file, str(e))
                return
            self.reference[ref_file] = p.parse()
        # ref is the (entities, map of key to index) pair of the reference
        ref = self.reference[ref_file]
        ref_list = sorted(ref[1].keys())
        try:
            p.readContents(l10n.getContents())
            l10n_entities, l10n_map = p.parse()
        except Exception as e:
            self.notify('error', l10n, str(e))
            return
        # offsets of the line starts in the l10n content, filled lazily
        lines = []

        def _getLine(offset):
            '''Return the 1-based line and the offset in that line.'''
            if not lines:
                lines.append(0)
                for m in self.nl.finditer(p.contents):
                    lines.append(m.end())
            for i in range(len(lines), 0, -1):
                if offset >= lines[i - 1]:
                    return (i, offset - lines[i - 1])
            return (1, offset)

        l10n_list = sorted(l10n_map.keys())
        ar = AddRemove()
        ar.set_left(ref_list)
        ar.set_right(l10n_list)
        report = missing = obsolete = changed = unchanged = keys = 0
        missings = []
        skips = []
        checker = getChecker(l10n, reference=ref[0])
        for action, item_or_pair in ar:
            if action == 'delete':
                # missing entity
                _rv = self.notify('missingEntity', l10n, item_or_pair)
                if _rv == "ignore":
                    continue
                if _rv == "error":
                    # only add to missing entities for l10n-merge on error,
                    # not report
                    missings.append(item_or_pair)
                    missing += 1
                else:
                    # just report
                    report += 1
            elif action == 'add':
                # obsolete entity or junk
                if isinstance(l10n_entities[l10n_map[item_or_pair]],
                              parser.Junk):
                    junk = l10n_entities[l10n_map[item_or_pair]]
                    params = (junk.val,) + junk.span
                    self.notify('error', l10n,
                                'Unparsed content "%s" at %d-%d' % params)
                    if self.merge_stage is not None:
                        skips.append(junk)
                elif self.notify('obsoleteEntity', l10n,
                                 item_or_pair) != 'ignore':
                    obsolete += 1
            else:
                # entity found in both ref and l10n, check for changed
                entity = item_or_pair[0]
                refent = ref[0][ref[1][entity]]
                l10nent = l10n_entities[l10n_map[entity]]
                if self.keyRE.search(entity):
                    keys += 1
                elif refent.val == l10nent.val:
                    self.doUnchanged(l10nent)
                    unchanged += 1
                else:
                    self.doChanged(ref_file, refent, l10nent)
                    changed += 1
                # run checks:
                if checker:
                    for tp, pos, msg, cat in checker.check(refent, l10nent):
                        # compute real src position, if first line,
                        # col needs adjustment
                        _l, _offset = _getLine(l10nent.val_span[0])
                        if isinstance(pos, tuple):
                            # line, column
                            if pos[0] == 1:
                                col = pos[1] + _offset
                            else:
                                col = pos[1]
                                _l += pos[0] - 1
                        else:
                            _l, col = _getLine(l10nent.val_span[0] + pos)
                        # skip error entities when merging
                        # NOTE(review): an entity with more than one
                        # error ends up in skips more than once, confirm
                        # that merge() is fine with that
                        if tp == 'error' and self.merge_stage is not None:
                            skips.append(l10nent)
                        self.notify(tp, l10n,
                                    u"%s at line %d, column %d for %s" %
                                    (msg, _l, col, refent.key))
        if missing:
            self.notify('missing', l10n, missing)
        if self.merge_stage is not None and (missings or skips):
            self.merge(ref[0], ref[1], ref_file, l10n, missings, skips, p)
        if report:
            self.notify('report', l10n, report)
        if obsolete:
            self.notify('obsolete', l10n, obsolete)
        if changed:
            self.notify('changed', l10n, changed)
        if unchanged:
            self.notify('unchanged', l10n, unchanged)
        if keys:
            self.notify('keys', l10n, keys)

    def add(self, orig, missing):
        '''Report a file that only exists in the reference.

        orig is the reference file, missing the file to be localized.
        Unless the file is ignored, the number of entries in the
        reference is reported as 'missingInFiles'.
        '''
        if self.notify('missingFile', missing, None) == "ignore":
            # filter said that we don't need this file, don't count it
            return
        f = orig
        try:
            p = parser.getParser(f.file)
        except UserWarning:
            return
        try:
            p.readContents(f.getContents())
            entities, entity_map = p.parse()
        except Exception as e:
            self.notify('error', f, str(e))
            return
        self.notify('missingInFiles', missing, len(entity_map))

    def doUnchanged(self, entity):
        # overload this if needed
        pass

    def doChanged(self, file, ref_entity, l10n_entity):
        # overload this if needed
        pass
+
+
def compareApp(app, other_observer=None, merge_stage=None, clobber=False):
    '''Compare locales set in app.

    app needs to provide a filter attribute, which is set as filter on
    the Observer, and to iterate as (module, reference, locales)
    tuples, with locales iterating as pairs with the localization in
    second position.

    Optional arguments are:
    - other_observer. A object implementing
        notify(category, _file, data)
      The return values of that callback are ignored.
    - merge_stage. A directory to be used for staging the output of
      l10n-merge. The string is formatted with the locale code as
      ab_CD for each localization.
    - clobber. Clobber the module subdirectories of the merge dir as we go.
      Use wisely, as it might cause data loss.

    Returns the Observer holding the results.
    '''
    comparer = ContentComparer()
    if other_observer is not None:
        comparer.add_observer(other_observer)
    comparer.observer.filter = app.filter
    for module, reference, locales in app:
        dir_comp = DirectoryCompare(reference)
        dir_comp.setWatcher(comparer)
        for _, localization in locales:
            if merge_stage is not None:
                locale_merge = merge_stage.format(ab_CD=localization.locale)
                comparer.set_merge_stage(locale_merge)
                if clobber:
                    # if clobber, remove the stage for the module if it exists
                    clobberdir = os.path.join(locale_merge, module)
                    if os.path.exists(clobberdir):
                        shutil.rmtree(clobberdir)
                        # single argument and parentheses, this prints the
                        # same on python 2 and 3
                        print("clobbered " + clobberdir)
            dir_comp.compareWith(localization)
    return comparer.observer
+
+
def compareDirs(reference, locale, other_observer=None, merge_stage=None):
    '''Compare the directory locale against the directory reference.

    Both directories are enumerated with paths.EnumerateDir.

    Optional arguments are:
    - other_observer. A object implementing
        notify(category, _file, data)
      The return values of that callback are ignored.
    - merge_stage. Set as merge stage on the ContentComparer, None
      leaves merging off.

    Returns the Observer holding the results.
    '''
    watcher = ContentComparer()
    watcher.set_merge_stage(merge_stage)
    if other_observer is not None:
        watcher.add_observer(other_observer)
    matcher = DirectoryCompare(paths.EnumerateDir(reference))
    matcher.setWatcher(watcher)
    localized = paths.EnumerateDir(locale)
    matcher.compareWith(localized)
    return watcher.observer