diff options
Diffstat (limited to 'testing/web-platform/tests/tools/html5lib/utils/entities.py')
-rw-r--r-- | testing/web-platform/tests/tools/html5lib/utils/entities.py | 88 |
1 files changed, 88 insertions, 0 deletions
diff --git a/testing/web-platform/tests/tools/html5lib/utils/entities.py b/testing/web-platform/tests/tools/html5lib/utils/entities.py new file mode 100644 index 000000000..116a27cbc --- /dev/null +++ b/testing/web-platform/tests/tools/html5lib/utils/entities.py @@ -0,0 +1,88 @@ +import json + +import html5lib + +def parse(path="html5ents.xml"): + return html5lib.parse(open(path), treebuilder="lxml") + +def entity_table(tree): + return dict((entity_name("".join(tr[0].xpath(".//text()"))), + entity_characters(tr[1].text)) + for tr in tree.xpath("//h:tbody/h:tr", + namespaces={"h":"http://www.w3.org/1999/xhtml"})) + +def entity_name(inp): + return inp.strip() + +def entity_characters(inp): + return "".join(codepoint_to_character(item) + for item in inp.split() + if item) + +def codepoint_to_character(inp): + return ("\U000"+inp[2:]).decode("unicode-escape") + +def make_tests_json(entities): + test_list = make_test_list(entities) + tests_json = {"tests": + [make_test(*item) for item in test_list] + } + return tests_json + +def make_test(name, characters, good): + return { + "description":test_description(name, good), + "input":"&%s"%name, + "output":test_expected(name, characters, good) + } + +def test_description(name, good): + with_semicolon = name.endswith(";") + semicolon_text = {True:"with a semi-colon", + False:"without a semi-colon"}[with_semicolon] + if good: + text = "Named entity: %s %s"%(name, semicolon_text) + else: + text = "Bad named entity: %s %s"%(name, semicolon_text) + return text + +def test_expected(name, characters, good): + rv = [] + if not good or not name.endswith(";"): + rv.append("ParseError") + rv.append(["Character", characters]) + return rv + +def make_test_list(entities): + tests = [] + for entity_name, characters in entities.items(): + if entity_name.endswith(";") and not subentity_exists(entity_name, entities): + tests.append((entity_name[:-1], "&" + entity_name[:-1], False)) + tests.append((entity_name, characters, True)) + return sorted(tests) + +def subentity_exists(entity_name, entities): + for i in range(1, len(entity_name)): + if entity_name[:-i] in entities: + return True + return False + +def make_entities_code(entities): + entities_text = "\n".join(" \"%s\": u\"%s\","%( + name, entities[name].encode( + "unicode-escape").replace("\"", "\\\"")) + for name in sorted(entities.keys())) + return """entities = { +%s +}"""%entities_text + +def main(): + entities = entity_table(parse()) + tests_json = make_tests_json(entities) + json.dump(tests_json, open("namedEntities.test", "w"), indent=4) + code = make_entities_code(entities) + open("entities_constants.py", "w").write(code) + +if __name__ == "__main__": + main() + |