diff options
Diffstat (limited to 'python/compare-locales/compare_locales/checks.py')
-rw-r--r-- | python/compare-locales/compare_locales/checks.py | 438 |
1 files changed, 438 insertions, 0 deletions
diff --git a/python/compare-locales/compare_locales/checks.py b/python/compare-locales/compare_locales/checks.py new file mode 100644 index 000000000..ee3bef03d --- /dev/null +++ b/python/compare-locales/compare_locales/checks.py @@ -0,0 +1,438 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import re +from difflib import SequenceMatcher +from xml import sax +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +from compare_locales.parser import DTDParser, PropertiesParser + + +class Checker(object): + '''Abstract class to implement checks per file type. + ''' + pattern = None + + @classmethod + def use(cls, file): + return cls.pattern.match(file.file) + + def check(self, refEnt, l10nEnt): + '''Given the reference and localized Entities, performs checks. + + This is a generator yielding tuples of + - "warning" or "error", depending on what should be reported, + - tuple of line, column info for the error within the string + - description string to be shown in the report + ''' + if True: + raise NotImplementedError("Need to subclass") + yield ("error", (0, 0), "This is an example error", "example") + + +class PrintfException(Exception): + def __init__(self, msg, pos): + self.pos = pos + self.msg = msg + + +class PropertiesChecker(Checker): + '''Tests to run on .properties files. + ''' + pattern = re.compile('.*\.properties$') + printf = re.compile(r'%(?P<good>%|' + r'(?:(?P<number>[1-9][0-9]*)\$)?' + r'(?P<width>\*|[0-9]+)?' + r'(?P<prec>\.(?:\*|[0-9]+)?)?' + r'(?P<spec>[duxXosScpfg]))?') + + def check(self, refEnt, l10nEnt): + '''Test for the different variable formats. + ''' + refValue, l10nValue = refEnt.val, l10nEnt.val + refSpecs = None + # check for PluralForm.jsm stuff, should have the docs in the + # comment + if 'Localization_and_Plurals' in refEnt.pre_comment: + # For plurals, common variable pattern is #1. Try that. + pats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)', + refValue)) + if len(pats) == 0: + return + lpats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)', + l10nValue)) + if pats - lpats: + yield ('warning', 0, 'not all variables used in l10n', + 'plural') + return + if lpats - pats: + yield ('error', 0, 'unreplaced variables in l10n', + 'plural') + return + return + # check for lost escapes + raw_val = l10nEnt.raw_val + for m in PropertiesParser.escape.finditer(raw_val): + if m.group('single') and \ + m.group('single') not in PropertiesParser.known_escapes: + yield ('warning', m.start(), + 'unknown escape sequence, \\' + m.group('single'), + 'escape') + try: + refSpecs = self.getPrintfSpecs(refValue) + except PrintfException: + refSpecs = [] + if refSpecs: + for t in self.checkPrintf(refSpecs, l10nValue): + yield t + return + + def checkPrintf(self, refSpecs, l10nValue): + try: + l10nSpecs = self.getPrintfSpecs(l10nValue) + except PrintfException, e: + yield ('error', e.pos, e.msg, 'printf') + return + if refSpecs != l10nSpecs: + sm = SequenceMatcher() + sm.set_seqs(refSpecs, l10nSpecs) + msgs = [] + warn = None + for action, i1, i2, j1, j2 in sm.get_opcodes(): + if action == 'equal': + continue + if action == 'delete': + # missing argument in l10n + if i2 == len(refSpecs): + # trailing specs missing, that's just a warning + warn = ', '.join('trailing argument %d `%s` missing' % + (i+1, refSpecs[i]) + for i in xrange(i1, i2)) + else: + for i in xrange(i1, i2): + msgs.append('argument %d `%s` missing' % + (i+1, refSpecs[i])) + continue + if action == 'insert': + # obsolete argument in l10n + for i in xrange(j1, j2): + msgs.append('argument %d `%s` obsolete' % + (i+1, l10nSpecs[i])) + continue + if action == 'replace': + for i, j in zip(xrange(i1, i2), xrange(j1, j2)): + msgs.append('argument %d `%s` should be `%s`' % + (j+1, l10nSpecs[j], refSpecs[i])) + if msgs: + yield ('error', 0, ', '.join(msgs), 'printf') + if warn is not None: + yield ('warning', 0, warn, 'printf') + + def getPrintfSpecs(self, val): + hasNumber = False + specs = [] + for m in self.printf.finditer(val): + if m.group("good") is None: + # found just a '%', signal an error + raise PrintfException('Found single %', m.start()) + if m.group("good") == '%': + # escaped % + continue + if ((hasNumber and m.group('number') is None) or + (not hasNumber and specs and + m.group('number') is not None)): + # mixed style, numbered and not + raise PrintfException('Mixed ordered and non-ordered args', + m.start()) + hasNumber = m.group('number') is not None + if hasNumber: + pos = int(m.group('number')) - 1 + ls = len(specs) + if pos >= ls: + # pad specs + nones = pos - ls + specs[ls:pos] = nones*[None] + specs.append(m.group('spec')) + else: + if specs[pos] is not None: + raise PrintfException('Double ordered argument %d' % + (pos+1), + m.start()) + specs[pos] = m.group('spec') + else: + specs.append(m.group('spec')) + # check for missing args + if hasNumber and not all(specs): + raise PrintfException('Ordered argument missing', 0) + return specs + + +class DTDChecker(Checker): + """Tests to run on DTD files. + + Uses xml.sax for the heavy lifting of xml parsing. + + The code tries to parse until it doesn't find any unresolved entities + anymore. If it finds one, it tries to grab the key, and adds an empty + <!ENTITY key ""> definition to the header. + + Also checks for some CSS and number heuristics in the values. + """ + pattern = re.compile('.*\.dtd$') + + eref = re.compile('&(%s);' % DTDParser.Name) + tmpl = '''<!DOCTYPE elem [%s]> +<elem>%s</elem> +''' + xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot')) + + def __init__(self, reference): + self.reference = reference + self.__known_entities = None + + def known_entities(self, refValue): + if self.__known_entities is None and self.reference is not None: + self.__known_entities = set() + for ent in self.reference: + self.__known_entities.update(self.entities_for_value(ent.val)) + return self.__known_entities if self.__known_entities is not None \ + else self.entities_for_value(refValue) + + def entities_for_value(self, value): + reflist = set(m.group(1).encode('utf-8') + for m in self.eref.finditer(value)) + reflist -= self.xmllist + return reflist + + # Setup for XML parser, with default and text-only content handler + class TextContent(sax.handler.ContentHandler): + textcontent = '' + + def characters(self, content): + self.textcontent += content + + defaulthandler = sax.handler.ContentHandler() + texthandler = TextContent() + + numPattern = r'([0-9]+|[0-9]*\.[0-9]+)' + num = re.compile('^%s$' % numPattern) + lengthPattern = '%s(em|px|ch|cm|in)' % numPattern + length = re.compile('^%s$' % lengthPattern) + spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' % + lengthPattern) + style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' % + {'spec': spec.pattern}) + + processContent = None + + def check(self, refEnt, l10nEnt): + """Try to parse the refvalue inside a dummy element, and keep + track of entities that we need to define to make that work. + + Return a checker that offers just those entities. + """ + refValue, l10nValue = refEnt.val, l10nEnt.val + # find entities the refValue references, + # reusing markup from DTDParser. + reflist = self.known_entities(refValue) + inContext = self.entities_for_value(refValue) + entities = ''.join('<!ENTITY %s "">' % s for s in sorted(reflist)) + parser = sax.make_parser() + parser.setFeature(sax.handler.feature_external_ges, False) + + parser.setContentHandler(self.defaulthandler) + try: + parser.parse(StringIO(self.tmpl % + (entities, refValue.encode('utf-8')))) + # also catch stray % + parser.parse(StringIO(self.tmpl % + (refEnt.all.encode('utf-8') + entities, + '&%s;' % refEnt.key.encode('utf-8')))) + except sax.SAXParseException, e: + yield ('warning', + (0, 0), + "can't parse en-US value", 'xmlparse') + + # find entities the l10nValue references, + # reusing markup from DTDParser. + l10nlist = self.entities_for_value(l10nValue) + missing = sorted(l10nlist - reflist) + _entities = entities + ''.join('<!ENTITY %s "">' % s for s in missing) + if self.processContent is not None: + self.texthandler.textcontent = '' + parser.setContentHandler(self.texthandler) + try: + parser.parse(StringIO(self.tmpl % (_entities, + l10nValue.encode('utf-8')))) + # also catch stray % + # if this fails, we need to substract the entity definition + parser.setContentHandler(self.defaulthandler) + parser.parse(StringIO(self.tmpl % ( + l10nEnt.all.encode('utf-8') + _entities, + '&%s;' % l10nEnt.key.encode('utf-8')))) + except sax.SAXParseException, e: + # xml parse error, yield error + # sometimes, the error is reported on our fake closing + # element, make that the end of the last line + lnr = e.getLineNumber() - 1 + lines = l10nValue.splitlines() + if lnr > len(lines): + lnr = len(lines) + col = len(lines[lnr-1]) + else: + col = e.getColumnNumber() + if lnr == 1: + # first line starts with <elem>, substract + col -= len("<elem>") + elif lnr == 0: + col -= len("<!DOCTYPE elem [") # first line is DOCTYPE + yield ('error', (lnr, col), ' '.join(e.args), 'xmlparse') + + warntmpl = u'Referencing unknown entity `%s`' + if reflist: + if inContext: + elsewhere = reflist - inContext + warntmpl += ' (%s used in context' % \ + ', '.join(sorted(inContext)) + if elsewhere: + warntmpl += ', %s known)' % ', '.join(sorted(elsewhere)) + else: + warntmpl += ')' + else: + warntmpl += ' (%s known)' % ', '.join(sorted(reflist)) + for key in missing: + yield ('warning', (0, 0), warntmpl % key.decode('utf-8'), + 'xmlparse') + if inContext and l10nlist and l10nlist - inContext - set(missing): + mismatch = sorted(l10nlist - inContext - set(missing)) + for key in mismatch: + yield ('warning', (0, 0), + 'Entity %s referenced, but %s used in context' % ( + key.decode('utf-8'), + ', '.join(sorted(inContext)) + ), 'xmlparse') + + # Number check + if self.num.match(refValue) and not self.num.match(l10nValue): + yield ('warning', 0, 'reference is a number', 'number') + # CSS checks + # just a length, width="100em" + if self.length.match(refValue) and not self.length.match(l10nValue): + yield ('error', 0, 'reference is a CSS length', 'css') + # real CSS spec, style="width:100px;" + if self.style.match(refValue): + if not self.style.match(l10nValue): + yield ('error', 0, 'reference is a CSS spec', 'css') + else: + # warn if different properties or units + refMap = dict((s, u) for s, _, u in + self.spec.findall(refValue)) + msgs = [] + for s, _, u in self.spec.findall(l10nValue): + if s not in refMap: + msgs.insert(0, '%s only in l10n' % s) + continue + else: + ru = refMap.pop(s) + if u != ru: + msgs.append("units for %s don't match " + "(%s != %s)" % (s, u, ru)) + for s in refMap.iterkeys(): + msgs.insert(0, '%s only in reference' % s) + if msgs: + yield ('warning', 0, ', '.join(msgs), 'css') + + if self.processContent is not None: + for t in self.processContent(self.texthandler.textcontent): + yield t + + +class PrincessAndroid(DTDChecker): + """Checker for the string values that Android puts into an XML container. + + http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling # noqa + has more info. Check for unescaped apostrophes and bad unicode escapes. + """ + quoted = re.compile("(?P<q>[\"']).*(?P=q)$") + + def unicode_escape(self, str): + """Helper method to try to decode all unicode escapes in a string. + + This code uses the standard python decode for unicode-escape, but + that's somewhat tricky, as its input needs to be ascii. To get to + ascii, the unicode string gets converted to ascii with + backslashreplace, i.e., all non-ascii unicode chars get unicode + escaped. And then we try to roll all of that back. + Now, when that hits an error, that's from the original string, and we + need to search for the actual error position in the original string, + as the backslashreplace code changes string positions quite badly. + See also the last check in TestAndroid.test_android_dtd, with a + lengthy chinese string. + """ + val = str.encode('ascii', 'backslashreplace') + try: + val.decode('unicode-escape') + except UnicodeDecodeError, e: + args = list(e.args) + badstring = args[1][args[2]:args[3]] + i = len(args[1][:args[2]].decode('unicode-escape')) + args[2] = i + args[3] = i + len(badstring) + raise UnicodeDecodeError(*args) + + @classmethod + def use(cls, file): + """Use this Checker only for DTD files in embedding/android.""" + return (file.module in ("embedding/android", + "mobile/android/base") and + cls.pattern.match(file.file)) + + def processContent(self, val): + """Actual check code. + Check for unicode escapes and unescaped quotes and apostrophes, + if string's not quoted. + """ + # first, try to decode unicode escapes + try: + self.unicode_escape(val) + except UnicodeDecodeError, e: + yield ('error', e.args[2], e.args[4], 'android') + # check for unescaped single or double quotes. + # first, see if the complete string is single or double quoted, + # that changes the rules + m = self.quoted.match(val) + if m: + q = m.group('q') + offset = 0 + val = val[1:-1] # strip quotes + else: + q = "[\"']" + offset = -1 + stray_quot = re.compile(r"[\\\\]*(%s)" % q) + + for m in stray_quot.finditer(val): + if len(m.group(0)) % 2: + # found an unescaped single or double quote, which message? + if m.group(1) == '"': + msg = u"Quotes in Android DTDs need escaping with \\\" "\ + u"or \\u0022, or put string in apostrophes." + else: + msg = u"Apostrophes in Android DTDs need escaping with "\ + u"\\' or \\u0027, or use \u2019, or put string in "\ + u"quotes." + yield ('error', m.end(0)+offset, msg, 'android') + + +def getChecker(file, reference=None): + if PropertiesChecker.use(file): + return PropertiesChecker() + if PrincessAndroid.use(file): + return PrincessAndroid(reference) + if DTDChecker.use(file): + return DTDChecker(reference) + return None |