Diffstat (limited to 'testing/web-platform/tests/tools/html5lib/utils')
3 files changed, 234 insertions, 0 deletions
diff --git a/testing/web-platform/tests/tools/html5lib/utils/entities.py b/testing/web-platform/tests/tools/html5lib/utils/entities.py
new file mode 100644
index 000000000..116a27cbc
--- /dev/null
+++ b/testing/web-platform/tests/tools/html5lib/utils/entities.py
@@ -0,0 +1,88 @@
+import json
+
+import html5lib
+
+
+def parse(path="html5ents.xml"):
+    return html5lib.parse(open(path), treebuilder="lxml")
+
+def entity_table(tree):
+    return dict((entity_name("".join(tr[0].xpath(".//text()"))),
+                 entity_characters(tr[1].text))
+                for tr in tree.xpath("//h:tbody/h:tr",
+                                     namespaces={"h":"http://www.w3.org/1999/xhtml"}))
+
+def entity_name(inp):
+    return inp.strip()
+
+def entity_characters(inp):
+    return "".join(codepoint_to_character(item)
+                   for item in inp.split()
+                   if item)
+
+def codepoint_to_character(inp):
+    return ("\U000"+inp[2:]).decode("unicode-escape")
+
+def make_tests_json(entities):
+    test_list = make_test_list(entities)
+    tests_json = {"tests":
+                  [make_test(*item) for item in test_list]
+                  }
+    return tests_json
+
+def make_test(name, characters, good):
+    return {
+        "description":test_description(name, good),
+        "input":"&%s"%name,
+        "output":test_expected(name, characters, good)
+        }
+
+def test_description(name, good):
+    with_semicolon = name.endswith(";")
+    semicolon_text = {True:"with a semi-colon",
+                      False:"without a semi-colon"}[with_semicolon]
+    if good:
+        text = "Named entity: %s %s"%(name, semicolon_text)
+    else:
+        text = "Bad named entity: %s %s"%(name, semicolon_text)
+    return text
+
+def test_expected(name, characters, good):
+    rv = []
+    if not good or not name.endswith(";"):
+        rv.append("ParseError")
+    rv.append(["Character", characters])
+    return rv
+
+def make_test_list(entities):
+    tests = []
+    for entity_name, characters in entities.items():
+        if entity_name.endswith(";") and not subentity_exists(entity_name, entities):
+            tests.append((entity_name[:-1], "&" + entity_name[:-1], False))
+        tests.append((entity_name, characters, True))
+    return sorted(tests)
+
+def subentity_exists(entity_name, entities):
+    for i in range(1, len(entity_name)):
+        if entity_name[:-i] in entities:
+            return True
+    return False
+
+def make_entities_code(entities):
+    entities_text = "\n".join("    \"%s\": u\"%s\","%(
+            name, entities[name].encode(
+                "unicode-escape").replace("\"", "\\\""))
+                              for name in sorted(entities.keys()))
+    return """entities = {
+%s
+}"""%entities_text
+
+def main():
+    entities = entity_table(parse())
+    tests_json = make_tests_json(entities)
+    json.dump(tests_json, open("namedEntities.test", "w"), indent=4)
+    code = make_entities_code(entities)
+    open("entities_constants.py", "w").write(code)
+
+if __name__ == "__main__":
+    main()
diff --git a/testing/web-platform/tests/tools/html5lib/utils/iana_parse.py b/testing/web-platform/tests/tools/html5lib/utils/iana_parse.py
new file mode 100644
index 000000000..6dde94c28
--- /dev/null
+++ b/testing/web-platform/tests/tools/html5lib/utils/iana_parse.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+import sys
+import urllib.request, urllib.error, urllib.parse
+import codecs
+
+def main():
+    encodings = []
+    f = urllib.request.urlopen(sys.argv[1])
+    for line in f:
+        if line.startswith("Name: ") or line.startswith("Alias: "):
+            enc = line.split()[1]
+            try:
+                codecs.lookup(enc)
+                if enc.lower() not in encodings:
+                    encodings.append(enc.lower())
+            except LookupError:
+                pass
+    sys.stdout.write("encodings = frozenset((\n")
+    for enc in encodings:
+        sys.stdout.write('    "%s",\n'%enc)
+    sys.stdout.write('    ))')
+
+if __name__ == "__main__":
+    main()
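For reference, a minimal sketch (not part of the patch) of what one generated test entry looks like, assuming entities.py is importable from the working directory; the entity name and character below are illustrative values, not taken from the diff:

    from entities import make_test

    # mirrors one tuple produced by make_test_list(): (name, characters, good)
    print(make_test("Agrave;", "\u00c0", True))
    # -> {'description': 'Named entity: Agrave; with a semi-colon',
    #     'input': '&Agrave;',
    #     'output': [['Character', 'À']]}
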
\ No newline at end of file
diff --git a/testing/web-platform/tests/tools/html5lib/utils/spider.py b/testing/web-platform/tests/tools/html5lib/utils/spider.py
new file mode 100644
index 000000000..a7b803197
--- /dev/null
+++ b/testing/web-platform/tests/tools/html5lib/utils/spider.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
+
+usage:
+import spider
+s = spider.Spider()
+s.run("http://www.google.com", maxURLs=100)
+"""
+
+import urllib.request, urllib.error, urllib.parse
+import urllib.robotparser
+import hashlib
+
+import httplib2
+
+import html5lib
+from html5lib.treebuilders import etree
+
+class Spider(object):
+    def __init__(self):
+        self.unvisitedURLs = set()
+        self.visitedURLs = set()
+        self.buggyURLs=set()
+        self.robotParser = urllib.robotparser.RobotFileParser()
+        self.contentDigest = {}
+        self.http = httplib2.Http(".cache")
+
+    def run(self, initialURL, maxURLs=1000):
+        urlNumber = 0
+        self.visitedURLs.add(initialURL)
+        content = self.loadURL(initialURL)
+        while maxURLs is None or urlNumber < maxURLs:
+            if content is not None:
+                self.parse(content)
+                urlNumber += 1
+            if not self.unvisitedURLs:
+                break
+            content = self.loadURL(self.unvisitedURLs.pop())
+
+    def parse(self, content):
+        failed = False
+        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
+        try:
+            tree = p.parse(content)
+        except:
+            self.buggyURLs.add(self.currentURL)
+            failed = True
+            print("BUGGY:", self.currentURL)
+        self.visitedURLs.add(self.currentURL)
+        if not failed:
+            self.updateURLs(tree)
+
+    def loadURL(self, url):
+        resp, content = self.http.request(url, "GET")
+        self.currentURL = url
+        digest = hashlib.md5(content).hexdigest()
+        if digest in self.contentDigest:
+            content = None
+            self.visitedURLs.add(url)
+        else:
+            self.contentDigest[digest] = url
+
+        if resp['status'] != "200":
+            content = None
+
+        return content
+
+    def updateURLs(self, tree):
+        """Take all the links in the current document, extract the URLs and
+        update the list of visited and unvisited URLs according to whether we
+        have seen them before or not"""
+        urls = set()
+        #Remove all links we have already visited
+        for link in tree.findall(".//a"):
+            try:
+                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                if (url and url not in self.unvisitedURLs and url
+                    not in self.visitedURLs):
+                    urls.add(url)
+            except KeyError:
+                pass
+
+        #Remove all non-http URLs and add a suitable base URL where that is
+        #missing
+        newUrls = set()
+        for url in urls:
+            splitURL = list(urllib.parse.urlsplit(url))
+            if splitURL[0] != "http":
+                continue
+            if splitURL[1] == "":
+                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
+            newUrls.add(urllib.parse.urlunsplit(splitURL))
+        urls = newUrls
+
+        responseHeaders = {}
+        #Now we want to find the content types of the links we haven't visited
+        for url in urls:
+            try:
+                resp, content = self.http.request(url, "HEAD")
+                responseHeaders[url] = resp
+            except (AttributeError, KeyError):
+                #Don't know why this happens
+                pass
+
+
+        #Remove links not of content-type html or pages not found
+        #XXX - need to deal with other status codes?
+        toVisit = set([url for url in urls if url in responseHeaders and
+                       "html" in responseHeaders[url]['content-type'] and
+                       responseHeaders[url]['status'] == "200"])
+
+        #Now check we are allowed to spider the page
+        for url in list(toVisit):
+            robotURL = list(urllib.parse.urlsplit(url)[:2])
+            robotURL.extend(["robots.txt", "", ""])
+            robotURL = urllib.parse.urlunsplit(robotURL)
+            self.robotParser.set_url(robotURL)
+            if not self.robotParser.can_fetch("*", url):
+                toVisit.remove(url)
+
+        self.visitedURLs.update(urls)
+        self.unvisitedURLs.update(toVisit)
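A rough usage sketch for the spider, following the module docstring; the start URL and the limit are placeholders, and httplib2 must be installed for the cached HTTP client:

    import spider

    s = spider.Spider()
    s.run("http://example.org/", maxURLs=50)   # stop after parsing 50 pages

    # pages that made the html5lib parser raise are collected for inspection
    print(sorted(s.buggyURLs))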