# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

'Mozilla l10n compare locales tool'

import codecs
import os
import os.path
import shutil
import re
from difflib import SequenceMatcher
from collections import defaultdict
try:
    from json import dumps
except ImportError:
    from simplejson import dumps

from compare_locales import parser
from compare_locales import paths
from compare_locales.checks import getChecker


class Tree(object):
    def __init__(self, valuetype):
        self.branches = dict()
        self.valuetype = valuetype
        self.value = None

    def __getitem__(self, leaf):
        parts = []
        if isinstance(leaf, paths.File):
            parts = [p for p in [leaf.locale, leaf.module] if p] + \
                leaf.file.split('/')
        else:
            parts = leaf.split('/')
        return self.__get(parts)

    def __get(self, parts):
        common = None
        old = None
        new = tuple(parts)
        t = self
        for k, v in self.branches.iteritems():
            for i, part in enumerate(zip(k, parts)):
                if part[0] != part[1]:
                    i -= 1
                    break
            if i < 0:
                continue
            i += 1
            common = tuple(k[:i])
            old = tuple(k[i:])
            new = tuple(parts[i:])
            break
        if old:
            self.branches.pop(k)
            t = Tree(self.valuetype)
            t.branches[old] = v
            self.branches[common] = t
        elif common:
            t = self.branches[common]
        if new:
            if common:
                return t.__get(new)
            t2 = t
            t = Tree(self.valuetype)
            t2.branches[new] = t
        if t.value is None:
            t.value = t.valuetype()
        return t.value

    indent = '  '

    def getContent(self, depth=0):
        '''
        Returns an iterator of (depth, flag, key_or_value) tuples.
        If flag is 'value', key_or_value is a value object, otherwise
        (flag is 'key') it's a key string.
        '''
        keys = self.branches.keys()
        keys.sort()
        if self.value is not None:
            yield (depth, 'value', self.value)
        for key in keys:
            yield (depth, 'key', key)
            for child in self.branches[key].getContent(depth + 1):
                yield child

    def toJSON(self):
        '''
        Returns this Tree as a JSON-able tree of hashes.
        Only the values need to take care that they're JSON-able.
        '''
        json = {}
        keys = self.branches.keys()
        keys.sort()
        if self.value is not None:
            json['value'] = self.value
        children = [('/'.join(key), self.branches[key].toJSON())
                    for key in keys]
        if children:
            json['children'] = children
        return json

    def getStrRows(self):
        def tostr(t):
            if t[1] == 'key':
                return self.indent * t[0] + '/'.join(t[2])
            return self.indent * (t[0] + 1) + str(t[2])

        return map(tostr, self.getContent())

    def __str__(self):
        return '\n'.join(self.getStrRows())
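
# A minimal sketch of how Tree is used (file and entity names here are
# hypothetical, not from the module): indexing with a slash-separated path
# walks the tree, splitting branches so common prefixes are shared, and
# returns the valuetype instance at the leaf for in-place mutation.
#
#   t = Tree(dict)
#   t['browser/menu.dtd']['missingEntity'] = ['foo.label']
#   t['browser/about.dtd']['error'] = ['parse error']
#   print t
#
# The shared 'browser' branch prints once, with 'menu.dtd' and 'about.dtd'
# nested one indent level deeper, each followed by its value dict.
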

class AddRemove(SequenceMatcher):
    def __init__(self):
        SequenceMatcher.__init__(self, None, None, None)

    def set_left(self, left):
        if not isinstance(left, list):
            left = [l for l in left]
        self.set_seq1(left)

    def set_right(self, right):
        if not isinstance(right, list):
            right = [l for l in right]
        self.set_seq2(right)

    def __iter__(self):
        for tag, i1, i2, j1, j2 in self.get_opcodes():
            if tag == 'equal':
                for pair in zip(self.a[i1:i2], self.b[j1:j2]):
                    yield ('equal', pair)
            elif tag == 'delete':
                for item in self.a[i1:i2]:
                    yield ('delete', item)
            elif tag == 'insert':
                for item in self.b[j1:j2]:
                    yield ('add', item)
            else:
                # tag == 'replace'
                for item in self.a[i1:i2]:
                    yield ('delete', item)
                for item in self.b[j1:j2]:
                    yield ('add', item)


class DirectoryCompare(SequenceMatcher):
    def __init__(self, reference):
        SequenceMatcher.__init__(self, None, [i for i in reference], [])
        self.watcher = None

    def setWatcher(self, watcher):
        self.watcher = watcher

    def compareWith(self, other):
        if not self.watcher:
            return
        self.set_seq2([i for i in other])
        for tag, i1, i2, j1, j2 in self.get_opcodes():
            if tag == 'equal':
                for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
                    self.watcher.compare(self.a[i], self.b[j])
            elif tag == 'delete':
                for i in xrange(i1, i2):
                    self.watcher.add(self.a[i], other.cloneFile(self.a[i]))
            elif tag == 'insert':
                for j in xrange(j1, j2):
                    self.watcher.remove(self.b[j])
            else:
                for j in xrange(j1, j2):
                    self.watcher.remove(self.b[j])
                for i in xrange(i1, i2):
                    self.watcher.add(self.a[i], other.cloneFile(self.a[i]))
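
# A minimal sketch of the AddRemove protocol (the sample keys are made up):
# both sides are sorted key lists, which SequenceMatcher lines up.
#
#   ar = AddRemove()
#   ar.set_left(['a', 'b', 'd'])   # reference entity keys
#   ar.set_right(['b', 'c', 'd'])  # l10n entity keys
#   list(ar)  # [('delete', 'a'), ('equal', ('b', 'b')),
#             #  ('add', 'c'), ('equal', ('d', 'd'))]
#
# Note that DirectoryCompare inverts the naming: a file missing on the l10n
# side arrives as 'delete' and is reported via watcher.add() (a file to add
# and localize), while an extra l10n file arrives as 'insert' and goes to
# watcher.remove() (an obsolete file).
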

class Observer(object):
    stat_cats = ['missing', 'obsolete', 'missingInFiles', 'report',
                 'changed', 'unchanged', 'keys']

    def __init__(self):
        class intdict(defaultdict):
            def __init__(self):
                defaultdict.__init__(self, int)

        self.summary = defaultdict(intdict)
        self.details = Tree(dict)
        self.filter = None

    # support pickling
    def __getstate__(self):
        return dict(summary=self.getSummary(), details=self.details)

    def __setstate__(self, state):
        class intdict(defaultdict):
            def __init__(self):
                defaultdict.__init__(self, int)

        self.summary = defaultdict(intdict)
        if 'summary' in state:
            for loc, stats in state['summary'].iteritems():
                self.summary[loc].update(stats)
        self.details = state['details']
        self.filter = None

    def getSummary(self):
        plaindict = {}
        for k, v in self.summary.iteritems():
            plaindict[k] = dict(v)
        return plaindict

    def toJSON(self):
        return dict(summary=self.getSummary(),
                    details=self.details.toJSON())

    def notify(self, category, file, data):
        rv = "error"
        if category in self.stat_cats:
            # these get called post reporting just for stats
            # return "error" to forward them to the other_observers
            self.summary[file.locale][category] += data
            # keep track of how many strings are in a missing file
            # we got the {'missingFile': 'error'} from the first pass
            if category == 'missingInFiles':
                self.details[file]['strings'] = data
            return "error"
        if category in ['missingFile', 'obsoleteFile']:
            if self.filter is not None:
                rv = self.filter(file)
            if rv != "ignore":
                self.details[file][category] = rv
            return rv
        if category in ['missingEntity', 'obsoleteEntity']:
            if self.filter is not None:
                rv = self.filter(file, data)
            if rv == "ignore":
                return rv
            v = self.details[file]
            try:
                v[category].append(data)
            except KeyError:
                v[category] = [data]
            return rv
        if category == 'error':
            try:
                self.details[file][category].append(data)
            except KeyError:
                self.details[file][category] = [data]
            self.summary[file.locale]['errors'] += 1
        elif category == 'warning':
            try:
                self.details[file][category].append(data)
            except KeyError:
                self.details[file][category] = [data]
            self.summary[file.locale]['warnings'] += 1
        return rv

    def toExhibit(self):
        items = []
        for locale in sorted(self.summary.iterkeys()):
            summary = self.summary[locale]
            if locale is not None:
                item = {'id': 'xxx/' + locale,
                        'label': locale,
                        'locale': locale}
            else:
                item = {'id': 'xxx', 'label': 'xxx', 'locale': 'xxx'}
            item['type'] = 'Build'
            total = sum([summary[k]
                         for k in ('changed', 'unchanged', 'report',
                                   'missing', 'missingInFiles')
                         if k in summary])
            # guard against empty summaries, as the text serializer does
            rate = 0
            if total:
                rate = (('changed' in summary and summary['changed'] * 100)
                        or 0) / total
            item.update((k, summary.get(k, 0))
                        for k in ('changed', 'unchanged'))
            item.update((k, summary[k])
                        for k in ('report', 'errors', 'warnings')
                        if k in summary)
            item['missing'] = summary.get('missing', 0) + \
                summary.get('missingInFiles', 0)
            item['completion'] = rate
            item['total'] = total
            result = 'success'
            if item.get('warnings', 0):
                result = 'warning'
            if item.get('errors', 0) or item.get('missing', 0):
                result = 'failure'
            item['result'] = result
            items.append(item)
        data = {
            "properties": dict.fromkeys(
                ("completion", "errors", "warnings", "missing", "report",
                 "unchanged", "changed", "obsolete"),
                {"valueType": "number"}),
            "types": {
                "Build": {"pluralLabel": "Builds"}
            }}
        data['items'] = items
        return dumps(data, indent=2)

    def serialize(self, type="text"):
        if type == "exhibit":
            return self.toExhibit()
        if type == "json":
            return dumps(self.toJSON())

        def tostr(t):
            if t[1] == 'key':
                return '  ' * t[0] + '/'.join(t[2])
            o = []
            indent = '  ' * (t[0] + 1)
            if 'error' in t[2]:
                o += [indent + 'ERROR: ' + e for e in t[2]['error']]
            if 'warning' in t[2]:
                o += [indent + 'WARNING: ' + e for e in t[2]['warning']]
            if 'missingEntity' in t[2] or 'obsoleteEntity' in t[2]:
                missingEntities = ('missingEntity' in t[2] and
                                   t[2]['missingEntity']) or []
                obsoleteEntities = ('obsoleteEntity' in t[2] and
                                    t[2]['obsoleteEntity']) or []
                entities = missingEntities + obsoleteEntities
                entities.sort()
                for entity in entities:
                    op = '+'
                    if entity in obsoleteEntities:
                        op = '-'
                    o.append(indent + op + entity)
            elif 'missingFile' in t[2]:
                o.append(indent + '// add and localize this file')
            elif 'obsoleteFile' in t[2]:
                o.append(indent + '// remove this file')
            return '\n'.join(o)

        out = []
        for locale, summary in sorted(self.summary.iteritems()):
            if locale is not None:
                out.append(locale + ':')
            out += [k + ': ' + str(v)
                    for k, v in sorted(summary.iteritems())]
            total = sum([summary[k]
                         for k in ['changed', 'unchanged', 'report',
                                   'missing', 'missingInFiles']
                         if k in summary])
            rate = 0
            if total:
                rate = (('changed' in summary and summary['changed'] * 100)
                        or 0) / total
            out.append('%d%% of entries changed' % rate)
        return '\n'.join(map(tostr, self.details.getContent()) + out)

    def __str__(self):
        return 'observer'
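
# A minimal sketch of the filter hook (the module value is hypothetical):
# Observer.filter is called as filter(file) for file categories and as
# filter(file, entity) for entity categories, so a callback with a
# defaulted second argument covers both. Returning "ignore" drops the
# item, "report" counts it without scheduling l10n-merge, and "error"
# (the default) counts it as missing and queues it for l10n-merge.
#
#   obs = Observer()
#   obs.filter = lambda file, entity=None: (
#       "ignore" if file.module == 'suite' else "error")
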

class ContentComparer:
    keyRE = re.compile('[kK]ey')
    nl = re.compile('\n', re.M)

    def __init__(self):
        '''Create a ContentComparer.
        observer is usually an instance of Observer. The return values
        of the notify method are used to control the handling of missing
        entities.
        '''
        self.reference = dict()
        self.observer = Observer()
        self.other_observers = []
        self.merge_stage = None

    def add_observer(self, obs):
        '''Add a non-filtering observer.
        Results from the notify calls are ignored.
        '''
        self.other_observers.append(obs)

    def set_merge_stage(self, merge_stage):
        self.merge_stage = merge_stage

    def merge(self, ref_entities, ref_map, ref_file, l10n_file, missing,
              skips, p):
        outfile = os.path.join(self.merge_stage, l10n_file.module,
                               l10n_file.file)
        outdir = os.path.dirname(outfile)
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        if not p.canMerge:
            shutil.copyfile(ref_file.fullpath, outfile)
            print "copied reference to " + outfile
            return
        if skips:
            # skips come in ordered by key name, we need them in file order
            skips.sort(key=lambda s: s.span[0])
        trailing = (['\n'] +
                    [ref_entities[ref_map[key]].all for key in missing] +
                    [ref_entities[ref_map[skip.key]].all for skip in skips
                     if not isinstance(skip, parser.Junk)])
        if skips:
            # we need to skip a few erroneous blocks in the input,
            # copy by hand
            f = codecs.open(outfile, 'wb', p.encoding)
            offset = 0
            for skip in skips:
                chunk = skip.span
                f.write(p.contents[offset:chunk[0]])
                offset = chunk[1]
            f.write(p.contents[offset:])
        else:
            shutil.copyfile(l10n_file.fullpath, outfile)
            f = codecs.open(outfile, 'ab', p.encoding)
        print "adding to " + outfile

        def ensureNewline(s):
            if not s.endswith('\n'):
                return s + '\n'
            return s

        f.write(''.join(map(ensureNewline, trailing)))
        f.close()

    def notify(self, category, file, data):
        """Check observer for the found data, and if it's
        not to ignore, notify other_observers.
        """
        rv = self.observer.notify(category, file, data)
        if rv == 'ignore':
            return rv
        for obs in self.other_observers:
            # non-filtering other_observers, ignore results
            obs.notify(category, file, data)
        return rv

    def remove(self, obsolete):
        self.notify('obsoleteFile', obsolete, None)

    def compare(self, ref_file, l10n):
        try:
            p = parser.getParser(ref_file.file)
        except UserWarning:
            # no comparison, XXX report?
            return
        if ref_file not in self.reference:
            # we didn't parse this before
            try:
                p.readContents(ref_file.getContents())
            except Exception, e:
                self.notify('error', ref_file, str(e))
                return
            self.reference[ref_file] = p.parse()
        ref = self.reference[ref_file]
        ref_list = ref[1].keys()
        ref_list.sort()
        try:
            p.readContents(l10n.getContents())
            l10n_entities, l10n_map = p.parse()
        except Exception, e:
            self.notify('error', l10n, str(e))
            return

        lines = []

        def _getLine(offset):
            if not lines:
                lines.append(0)
                for m in self.nl.finditer(p.contents):
                    lines.append(m.end())
            for i in xrange(len(lines), 0, -1):
                if offset >= lines[i - 1]:
                    return (i, offset - lines[i - 1])
            return (1, offset)

        l10n_list = l10n_map.keys()
        l10n_list.sort()
        ar = AddRemove()
        ar.set_left(ref_list)
        ar.set_right(l10n_list)
        report = missing = obsolete = changed = unchanged = keys = 0
        missings = []
        skips = []
        checker = getChecker(l10n, reference=ref[0])
        for action, item_or_pair in ar:
            if action == 'delete':
                # missing entity
                _rv = self.notify('missingEntity', l10n, item_or_pair)
                if _rv == "ignore":
                    continue
                if _rv == "error":
                    # only add to missing entities for l10n-merge on error,
                    # not report
                    missings.append(item_or_pair)
                    missing += 1
                else:
                    # just report
                    report += 1
            elif action == 'add':
                # obsolete entity or junk
                if isinstance(l10n_entities[l10n_map[item_or_pair]],
                              parser.Junk):
                    junk = l10n_entities[l10n_map[item_or_pair]]
                    params = (junk.val,) + junk.span
                    self.notify('error', l10n,
                                'Unparsed content "%s" at %d-%d' % params)
                    if self.merge_stage is not None:
                        skips.append(junk)
                elif self.notify('obsoleteEntity', l10n,
                                 item_or_pair) != 'ignore':
                    obsolete += 1
            else:
                # entity found in both ref and l10n, check for changed
                entity = item_or_pair[0]
                refent = ref[0][ref[1][entity]]
                l10nent = l10n_entities[l10n_map[entity]]
                if self.keyRE.search(entity):
                    keys += 1
                else:
                    if refent.val == l10nent.val:
                        self.doUnchanged(l10nent)
                        unchanged += 1
                    else:
                        self.doChanged(ref_file, refent, l10nent)
                        changed += 1
                # run checks:
                if checker:
                    for tp, pos, msg, cat in checker.check(refent, l10nent):
                        # compute real src position, if first line,
                        # col needs adjustment
                        _l, _offset = _getLine(l10nent.val_span[0])
                        if isinstance(pos, tuple):
                            # line, column
                            if pos[0] == 1:
                                col = pos[1] + _offset
                            else:
                                col = pos[1]
                                _l += pos[0] - 1
                        else:
                            _l, col = _getLine(l10nent.val_span[0] + pos)
                        # skip error entities when merging
                        if tp == 'error' and self.merge_stage is not None:
                            skips.append(l10nent)
                        self.notify(tp, l10n,
                                    u"%s at line %d, column %d for %s" %
                                    (msg, _l, col, refent.key))
        if missing:
            self.notify('missing', l10n, missing)
        if self.merge_stage is not None and (missings or skips):
            self.merge(ref[0], ref[1], ref_file, l10n, missings, skips, p)
        if report:
            self.notify('report', l10n, report)
        if obsolete:
            self.notify('obsolete', l10n, obsolete)
        if changed:
            self.notify('changed', l10n, changed)
        if unchanged:
            self.notify('unchanged', l10n, unchanged)
        if keys:
            self.notify('keys', l10n, keys)

    def add(self, orig, missing):
        if self.notify('missingFile', missing, None) == "ignore":
            # filter said that we don't need this file, don't count it
            return
        f = orig
        try:
            p = parser.getParser(f.file)
        except UserWarning:
            return
        try:
            p.readContents(f.getContents())
            entities, map = p.parse()
        except Exception, e:
            self.notify('error', f, str(e))
            return
        self.notify('missingInFiles', missing, len(map))

    def doUnchanged(self, entity):
        # overload this if needed
        pass

    def doChanged(self, file, ref_entity, l10n_entity):
        # overload this if needed
        pass
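
# doUnchanged and doChanged are deliberate overload points; a sketch of a
# subclass that logs changed entities (hypothetical, not part of the tool):
#
#   class LoggingComparer(ContentComparer):
#       def doChanged(self, file, ref_entity, l10n_entity):
#           print '%s: %s differs from the reference' % (file.file,
#                                                        ref_entity.key)
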

def compareApp(app, other_observer=None, merge_stage=None, clobber=False):
    '''Compare locales set in app.

    Optional arguments are:
    - other_observer. An object implementing
        notify(category, _file, data)
      The return values of that callback are ignored.
    - merge_stage. A directory to be used for staging the output of
      l10n-merge.
    - clobber. Clobber the module subdirectories of the merge dir as we go.
      Use wisely, as it might cause data loss.
    '''
    comparer = ContentComparer()
    if other_observer is not None:
        comparer.add_observer(other_observer)
    comparer.observer.filter = app.filter
    for module, reference, locales in app:
        dir_comp = DirectoryCompare(reference)
        dir_comp.setWatcher(comparer)
        for _, localization in locales:
            if merge_stage is not None:
                locale_merge = merge_stage.format(ab_CD=localization.locale)
                comparer.set_merge_stage(locale_merge)
                if clobber:
                    # if clobber, remove the stage for the module if it exists
                    clobberdir = os.path.join(locale_merge, module)
                    if os.path.exists(clobberdir):
                        shutil.rmtree(clobberdir)
                        print "clobbered " + clobberdir
            dir_comp.compareWith(localization)
    return comparer.observer
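
# A sketch of a typical driver, assuming `app` is an application
# configuration (such as an EnumerateApp from compare_locales.paths)
# that yields (module, reference, locales) tuples and exposes a .filter
# callback; the merge_stage template is expanded per locale via
# .format(ab_CD=...):
#
#   observer = compareApp(app, merge_stage='merged/{ab_CD}')
#   print observer.serialize()
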

def compareDirs(reference, locale, other_observer=None, merge_stage=None):
    '''Compare reference and locale dir.

    Optional arguments are:
    - other_observer. An object implementing
        notify(category, _file, data)
      The return values of that callback are ignored.
    '''
    comparer = ContentComparer()
    if other_observer is not None:
        comparer.add_observer(other_observer)
    comparer.set_merge_stage(merge_stage)
    dir_comp = DirectoryCompare(paths.EnumerateDir(reference))
    dir_comp.setWatcher(comparer)
    dir_comp.compareWith(paths.EnumerateDir(locale))
    return comparer.observer
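
# A sketch of comparing two plain directories (the paths are hypothetical)
# and printing the default text report:
#
#   observer = compareDirs('mozilla/en-US', 'l10n/de')
#   print observer.serialize()
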