summaryrefslogtreecommitdiffstats
path: root/python/compare-locales/compare_locales/checks.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/compare-locales/compare_locales/checks.py')
-rw-r--r--python/compare-locales/compare_locales/checks.py438
1 files changed, 438 insertions, 0 deletions
diff --git a/python/compare-locales/compare_locales/checks.py b/python/compare-locales/compare_locales/checks.py
new file mode 100644
index 000000000..ee3bef03d
--- /dev/null
+++ b/python/compare-locales/compare_locales/checks.py
@@ -0,0 +1,438 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+from difflib import SequenceMatcher
+from xml import sax
+try:
+ from cStringIO import StringIO
+except ImportError:
+ from StringIO import StringIO
+
+from compare_locales.parser import DTDParser, PropertiesParser
+
+
+class Checker(object):
+ '''Abstract class to implement checks per file type.
+ '''
+ pattern = None
+
+ @classmethod
+ def use(cls, file):
+ return cls.pattern.match(file.file)
+
+ def check(self, refEnt, l10nEnt):
+ '''Given the reference and localized Entities, performs checks.
+
+ This is a generator yielding tuples of
+ - "warning" or "error", depending on what should be reported,
+ - tuple of line, column info for the error within the string
+ - description string to be shown in the report
+ '''
+ if True:
+ raise NotImplementedError("Need to subclass")
+ yield ("error", (0, 0), "This is an example error", "example")
+
+
+class PrintfException(Exception):
+ def __init__(self, msg, pos):
+ self.pos = pos
+ self.msg = msg
+
+
+class PropertiesChecker(Checker):
+ '''Tests to run on .properties files.
+ '''
+ pattern = re.compile('.*\.properties$')
+ printf = re.compile(r'%(?P<good>%|'
+ r'(?:(?P<number>[1-9][0-9]*)\$)?'
+ r'(?P<width>\*|[0-9]+)?'
+ r'(?P<prec>\.(?:\*|[0-9]+)?)?'
+ r'(?P<spec>[duxXosScpfg]))?')
+
+ def check(self, refEnt, l10nEnt):
+ '''Test for the different variable formats.
+ '''
+ refValue, l10nValue = refEnt.val, l10nEnt.val
+ refSpecs = None
+ # check for PluralForm.jsm stuff, should have the docs in the
+ # comment
+ if 'Localization_and_Plurals' in refEnt.pre_comment:
+ # For plurals, common variable pattern is #1. Try that.
+ pats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
+ refValue))
+ if len(pats) == 0:
+ return
+ lpats = set(int(m.group(1)) for m in re.finditer('#([0-9]+)',
+ l10nValue))
+ if pats - lpats:
+ yield ('warning', 0, 'not all variables used in l10n',
+ 'plural')
+ return
+ if lpats - pats:
+ yield ('error', 0, 'unreplaced variables in l10n',
+ 'plural')
+ return
+ return
+ # check for lost escapes
+ raw_val = l10nEnt.raw_val
+ for m in PropertiesParser.escape.finditer(raw_val):
+ if m.group('single') and \
+ m.group('single') not in PropertiesParser.known_escapes:
+ yield ('warning', m.start(),
+ 'unknown escape sequence, \\' + m.group('single'),
+ 'escape')
+ try:
+ refSpecs = self.getPrintfSpecs(refValue)
+ except PrintfException:
+ refSpecs = []
+ if refSpecs:
+ for t in self.checkPrintf(refSpecs, l10nValue):
+ yield t
+ return
+
+ def checkPrintf(self, refSpecs, l10nValue):
+ try:
+ l10nSpecs = self.getPrintfSpecs(l10nValue)
+ except PrintfException, e:
+ yield ('error', e.pos, e.msg, 'printf')
+ return
+ if refSpecs != l10nSpecs:
+ sm = SequenceMatcher()
+ sm.set_seqs(refSpecs, l10nSpecs)
+ msgs = []
+ warn = None
+ for action, i1, i2, j1, j2 in sm.get_opcodes():
+ if action == 'equal':
+ continue
+ if action == 'delete':
+ # missing argument in l10n
+ if i2 == len(refSpecs):
+ # trailing specs missing, that's just a warning
+ warn = ', '.join('trailing argument %d `%s` missing' %
+ (i+1, refSpecs[i])
+ for i in xrange(i1, i2))
+ else:
+ for i in xrange(i1, i2):
+ msgs.append('argument %d `%s` missing' %
+ (i+1, refSpecs[i]))
+ continue
+ if action == 'insert':
+ # obsolete argument in l10n
+ for i in xrange(j1, j2):
+ msgs.append('argument %d `%s` obsolete' %
+ (i+1, l10nSpecs[i]))
+ continue
+ if action == 'replace':
+ for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
+ msgs.append('argument %d `%s` should be `%s`' %
+ (j+1, l10nSpecs[j], refSpecs[i]))
+ if msgs:
+ yield ('error', 0, ', '.join(msgs), 'printf')
+ if warn is not None:
+ yield ('warning', 0, warn, 'printf')
+
+ def getPrintfSpecs(self, val):
+ hasNumber = False
+ specs = []
+ for m in self.printf.finditer(val):
+ if m.group("good") is None:
+ # found just a '%', signal an error
+ raise PrintfException('Found single %', m.start())
+ if m.group("good") == '%':
+ # escaped %
+ continue
+ if ((hasNumber and m.group('number') is None) or
+ (not hasNumber and specs and
+ m.group('number') is not None)):
+ # mixed style, numbered and not
+ raise PrintfException('Mixed ordered and non-ordered args',
+ m.start())
+ hasNumber = m.group('number') is not None
+ if hasNumber:
+ pos = int(m.group('number')) - 1
+ ls = len(specs)
+ if pos >= ls:
+ # pad specs
+ nones = pos - ls
+ specs[ls:pos] = nones*[None]
+ specs.append(m.group('spec'))
+ else:
+ if specs[pos] is not None:
+ raise PrintfException('Double ordered argument %d' %
+ (pos+1),
+ m.start())
+ specs[pos] = m.group('spec')
+ else:
+ specs.append(m.group('spec'))
+ # check for missing args
+ if hasNumber and not all(specs):
+ raise PrintfException('Ordered argument missing', 0)
+ return specs
+
+
+class DTDChecker(Checker):
+ """Tests to run on DTD files.
+
+ Uses xml.sax for the heavy lifting of xml parsing.
+
+ The code tries to parse until it doesn't find any unresolved entities
+ anymore. If it finds one, it tries to grab the key, and adds an empty
+ <!ENTITY key ""> definition to the header.
+
+ Also checks for some CSS and number heuristics in the values.
+ """
+ pattern = re.compile('.*\.dtd$')
+
+ eref = re.compile('&(%s);' % DTDParser.Name)
+ tmpl = '''<!DOCTYPE elem [%s]>
+<elem>%s</elem>
+'''
+ xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot'))
+
+ def __init__(self, reference):
+ self.reference = reference
+ self.__known_entities = None
+
+ def known_entities(self, refValue):
+ if self.__known_entities is None and self.reference is not None:
+ self.__known_entities = set()
+ for ent in self.reference:
+ self.__known_entities.update(self.entities_for_value(ent.val))
+ return self.__known_entities if self.__known_entities is not None \
+ else self.entities_for_value(refValue)
+
+ def entities_for_value(self, value):
+ reflist = set(m.group(1).encode('utf-8')
+ for m in self.eref.finditer(value))
+ reflist -= self.xmllist
+ return reflist
+
+ # Setup for XML parser, with default and text-only content handler
+ class TextContent(sax.handler.ContentHandler):
+ textcontent = ''
+
+ def characters(self, content):
+ self.textcontent += content
+
+ defaulthandler = sax.handler.ContentHandler()
+ texthandler = TextContent()
+
+ numPattern = r'([0-9]+|[0-9]*\.[0-9]+)'
+ num = re.compile('^%s$' % numPattern)
+ lengthPattern = '%s(em|px|ch|cm|in)' % numPattern
+ length = re.compile('^%s$' % lengthPattern)
+ spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' %
+ lengthPattern)
+ style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' %
+ {'spec': spec.pattern})
+
+ processContent = None
+
+ def check(self, refEnt, l10nEnt):
+ """Try to parse the refvalue inside a dummy element, and keep
+ track of entities that we need to define to make that work.
+
+ Return a checker that offers just those entities.
+ """
+ refValue, l10nValue = refEnt.val, l10nEnt.val
+ # find entities the refValue references,
+ # reusing markup from DTDParser.
+ reflist = self.known_entities(refValue)
+ inContext = self.entities_for_value(refValue)
+ entities = ''.join('<!ENTITY %s "">' % s for s in sorted(reflist))
+ parser = sax.make_parser()
+ parser.setFeature(sax.handler.feature_external_ges, False)
+
+ parser.setContentHandler(self.defaulthandler)
+ try:
+ parser.parse(StringIO(self.tmpl %
+ (entities, refValue.encode('utf-8'))))
+ # also catch stray %
+ parser.parse(StringIO(self.tmpl %
+ (refEnt.all.encode('utf-8') + entities,
+ '&%s;' % refEnt.key.encode('utf-8'))))
+ except sax.SAXParseException, e:
+ yield ('warning',
+ (0, 0),
+ "can't parse en-US value", 'xmlparse')
+
+ # find entities the l10nValue references,
+ # reusing markup from DTDParser.
+ l10nlist = self.entities_for_value(l10nValue)
+ missing = sorted(l10nlist - reflist)
+ _entities = entities + ''.join('<!ENTITY %s "">' % s for s in missing)
+ if self.processContent is not None:
+ self.texthandler.textcontent = ''
+ parser.setContentHandler(self.texthandler)
+ try:
+ parser.parse(StringIO(self.tmpl % (_entities,
+ l10nValue.encode('utf-8'))))
+ # also catch stray %
+ # if this fails, we need to substract the entity definition
+ parser.setContentHandler(self.defaulthandler)
+ parser.parse(StringIO(self.tmpl % (
+ l10nEnt.all.encode('utf-8') + _entities,
+ '&%s;' % l10nEnt.key.encode('utf-8'))))
+ except sax.SAXParseException, e:
+ # xml parse error, yield error
+ # sometimes, the error is reported on our fake closing
+ # element, make that the end of the last line
+ lnr = e.getLineNumber() - 1
+ lines = l10nValue.splitlines()
+ if lnr > len(lines):
+ lnr = len(lines)
+ col = len(lines[lnr-1])
+ else:
+ col = e.getColumnNumber()
+ if lnr == 1:
+ # first line starts with <elem>, substract
+ col -= len("<elem>")
+ elif lnr == 0:
+ col -= len("<!DOCTYPE elem [") # first line is DOCTYPE
+ yield ('error', (lnr, col), ' '.join(e.args), 'xmlparse')
+
+ warntmpl = u'Referencing unknown entity `%s`'
+ if reflist:
+ if inContext:
+ elsewhere = reflist - inContext
+ warntmpl += ' (%s used in context' % \
+ ', '.join(sorted(inContext))
+ if elsewhere:
+ warntmpl += ', %s known)' % ', '.join(sorted(elsewhere))
+ else:
+ warntmpl += ')'
+ else:
+ warntmpl += ' (%s known)' % ', '.join(sorted(reflist))
+ for key in missing:
+ yield ('warning', (0, 0), warntmpl % key.decode('utf-8'),
+ 'xmlparse')
+ if inContext and l10nlist and l10nlist - inContext - set(missing):
+ mismatch = sorted(l10nlist - inContext - set(missing))
+ for key in mismatch:
+ yield ('warning', (0, 0),
+ 'Entity %s referenced, but %s used in context' % (
+ key.decode('utf-8'),
+ ', '.join(sorted(inContext))
+ ), 'xmlparse')
+
+ # Number check
+ if self.num.match(refValue) and not self.num.match(l10nValue):
+ yield ('warning', 0, 'reference is a number', 'number')
+ # CSS checks
+ # just a length, width="100em"
+ if self.length.match(refValue) and not self.length.match(l10nValue):
+ yield ('error', 0, 'reference is a CSS length', 'css')
+ # real CSS spec, style="width:100px;"
+ if self.style.match(refValue):
+ if not self.style.match(l10nValue):
+ yield ('error', 0, 'reference is a CSS spec', 'css')
+ else:
+ # warn if different properties or units
+ refMap = dict((s, u) for s, _, u in
+ self.spec.findall(refValue))
+ msgs = []
+ for s, _, u in self.spec.findall(l10nValue):
+ if s not in refMap:
+ msgs.insert(0, '%s only in l10n' % s)
+ continue
+ else:
+ ru = refMap.pop(s)
+ if u != ru:
+ msgs.append("units for %s don't match "
+ "(%s != %s)" % (s, u, ru))
+ for s in refMap.iterkeys():
+ msgs.insert(0, '%s only in reference' % s)
+ if msgs:
+ yield ('warning', 0, ', '.join(msgs), 'css')
+
+ if self.processContent is not None:
+ for t in self.processContent(self.texthandler.textcontent):
+ yield t
+
+
+class PrincessAndroid(DTDChecker):
+ """Checker for the string values that Android puts into an XML container.
+
+ http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling # noqa
+ has more info. Check for unescaped apostrophes and bad unicode escapes.
+ """
+ quoted = re.compile("(?P<q>[\"']).*(?P=q)$")
+
+ def unicode_escape(self, str):
+ """Helper method to try to decode all unicode escapes in a string.
+
+ This code uses the standard python decode for unicode-escape, but
+ that's somewhat tricky, as its input needs to be ascii. To get to
+ ascii, the unicode string gets converted to ascii with
+ backslashreplace, i.e., all non-ascii unicode chars get unicode
+ escaped. And then we try to roll all of that back.
+ Now, when that hits an error, that's from the original string, and we
+ need to search for the actual error position in the original string,
+ as the backslashreplace code changes string positions quite badly.
+ See also the last check in TestAndroid.test_android_dtd, with a
+ lengthy chinese string.
+ """
+ val = str.encode('ascii', 'backslashreplace')
+ try:
+ val.decode('unicode-escape')
+ except UnicodeDecodeError, e:
+ args = list(e.args)
+ badstring = args[1][args[2]:args[3]]
+ i = len(args[1][:args[2]].decode('unicode-escape'))
+ args[2] = i
+ args[3] = i + len(badstring)
+ raise UnicodeDecodeError(*args)
+
+ @classmethod
+ def use(cls, file):
+ """Use this Checker only for DTD files in embedding/android."""
+ return (file.module in ("embedding/android",
+ "mobile/android/base") and
+ cls.pattern.match(file.file))
+
+ def processContent(self, val):
+ """Actual check code.
+ Check for unicode escapes and unescaped quotes and apostrophes,
+ if string's not quoted.
+ """
+ # first, try to decode unicode escapes
+ try:
+ self.unicode_escape(val)
+ except UnicodeDecodeError, e:
+ yield ('error', e.args[2], e.args[4], 'android')
+ # check for unescaped single or double quotes.
+ # first, see if the complete string is single or double quoted,
+ # that changes the rules
+ m = self.quoted.match(val)
+ if m:
+ q = m.group('q')
+ offset = 0
+ val = val[1:-1] # strip quotes
+ else:
+ q = "[\"']"
+ offset = -1
+ stray_quot = re.compile(r"[\\\\]*(%s)" % q)
+
+ for m in stray_quot.finditer(val):
+ if len(m.group(0)) % 2:
+ # found an unescaped single or double quote, which message?
+ if m.group(1) == '"':
+ msg = u"Quotes in Android DTDs need escaping with \\\" "\
+ u"or \\u0022, or put string in apostrophes."
+ else:
+ msg = u"Apostrophes in Android DTDs need escaping with "\
+ u"\\' or \\u0027, or use \u2019, or put string in "\
+ u"quotes."
+ yield ('error', m.end(0)+offset, msg, 'android')
+
+
+def getChecker(file, reference=None):
+ if PropertiesChecker.use(file):
+ return PropertiesChecker()
+ if PrincessAndroid.use(file):
+ return PrincessAndroid(reference)
+ if DTDChecker.use(file):
+ return DTDChecker(reference)
+ return None