author     Matt A. Tobin <mattatobin@localhost.localdomain>    2018-02-02 04:16:08 -0500
committer  Matt A. Tobin <mattatobin@localhost.localdomain>    2018-02-02 04:16:08 -0500
commit     5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree       10027f336435511475e392454359edea8e25895d /testing/web-platform/tests/tools/html5lib/utils
parent     49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
Add m-esr52 at 52.6.0
Diffstat (limited to 'testing/web-platform/tests/tools/html5lib/utils')
-rw-r--r--  testing/web-platform/tests/tools/html5lib/utils/entities.py     88
-rw-r--r--  testing/web-platform/tests/tools/html5lib/utils/iana_parse.py   24
-rw-r--r--  testing/web-platform/tests/tools/html5lib/utils/spider.py      122
3 files changed, 234 insertions, 0 deletions
diff --git a/testing/web-platform/tests/tools/html5lib/utils/entities.py b/testing/web-platform/tests/tools/html5lib/utils/entities.py
new file mode 100644
index 000000000..116a27cbc
--- /dev/null
+++ b/testing/web-platform/tests/tools/html5lib/utils/entities.py
@@ -0,0 +1,88 @@
+import json
+
+import html5lib
+
+def parse(path="html5ents.xml"):
+    return html5lib.parse(open(path), treebuilder="lxml")
+
+def entity_table(tree):
+    return dict((entity_name("".join(tr[0].xpath(".//text()"))),
+                 entity_characters(tr[1].text))
+                for tr in tree.xpath("//h:tbody/h:tr",
+                                     namespaces={"h": "http://www.w3.org/1999/xhtml"}))
+
+def entity_name(inp):
+    return inp.strip()
+
+def entity_characters(inp):
+    return "".join(codepoint_to_character(item)
+                   for item in inp.split()
+                   if item)
+
+def codepoint_to_character(inp):
+    return chr(int(inp[2:], 16))
+
+def make_tests_json(entities):
+    test_list = make_test_list(entities)
+    tests_json = {"tests":
+                  [make_test(*item) for item in test_list]
+                  }
+    return tests_json
+
+def make_test(name, characters, good):
+    return {
+        "description": test_description(name, good),
+        "input": "&%s" % name,
+        "output": test_expected(name, characters, good)
+    }
+
+def test_description(name, good):
+    with_semicolon = name.endswith(";")
+    semicolon_text = {True: "with a semi-colon",
+                      False: "without a semi-colon"}[with_semicolon]
+    if good:
+        text = "Named entity: %s %s" % (name, semicolon_text)
+    else:
+        text = "Bad named entity: %s %s" % (name, semicolon_text)
+    return text
+
+def test_expected(name, characters, good):
+    rv = []
+    if not good or not name.endswith(";"):
+        rv.append("ParseError")
+    rv.append(["Character", characters])
+    return rv
+
+def make_test_list(entities):
+    tests = []
+    for entity_name, characters in entities.items():
+        if entity_name.endswith(";") and not subentity_exists(entity_name, entities):
+            tests.append((entity_name[:-1], "&" + entity_name[:-1], False))
+        tests.append((entity_name, characters, True))
+    return sorted(tests)
+
+def subentity_exists(entity_name, entities):
+    for i in range(1, len(entity_name)):
+        if entity_name[:-i] in entities:
+            return True
+    return False
+
+def make_entities_code(entities):
+    entities_text = "\n".join("    \"%s\": u\"%s\"," % (
+        name, entities[name].encode(
+            "unicode-escape").decode("ascii").replace("\"", "\\\""))
+        for name in sorted(entities.keys()))
+    return """entities = {
+%s
+}""" % entities_text
+
+def main():
+    entities = entity_table(parse())
+    tests_json = make_tests_json(entities)
+    json.dump(tests_json, open("namedEntities.test", "w"), indent=4)
+    code = make_entities_code(entities)
+    open("entities_constants.py", "w").write(code)
+
+if __name__ == "__main__":
+    main()
+
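The helpers above are pure functions, so they can be sanity-checked in isolation. A minimal sketch, assuming the script is importable as a module named entities from this directory (the toy table below is illustrative, not part of the script):

    import json
    from entities import make_tests_json, subentity_exists

    # Toy entity table mapping entity names to replacement text.
    table = {"amp;": "&", "AMP": "&"}

    # False: no proper prefix of "amp;" ("amp", "am", "a") is itself a key.
    print(subentity_exists("amp;", table))

    # Three records: good "amp;", good "AMP", and a bad semicolon-less
    # "&amp" case whose expected output starts with "ParseError".
    print(json.dumps(make_tests_json(table), indent=2))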
diff --git a/testing/web-platform/tests/tools/html5lib/utils/iana_parse.py b/testing/web-platform/tests/tools/html5lib/utils/iana_parse.py
new file mode 100644
index 000000000..6dde94c28
--- /dev/null
+++ b/testing/web-platform/tests/tools/html5lib/utils/iana_parse.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+import sys
+import urllib.request, urllib.error, urllib.parse
+import codecs
+
+def main():
+    encodings = []
+    f = urllib.request.urlopen(sys.argv[1])
+    for line in f:
+        if line.startswith(b"Name: ") or line.startswith(b"Alias: "):
+            enc = line.split()[1].decode("ascii")
+            try:
+                codecs.lookup(enc)
+                if enc.lower() not in encodings:
+                    encodings.append(enc.lower())
+            except LookupError:
+                pass
+    sys.stdout.write("encodings = frozenset((\n")
+    for enc in encodings:
+        sys.stdout.write('    "%s",\n' % enc)
+    sys.stdout.write('    ))')
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
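The registry URL comes in as sys.argv[1]; the script keeps only the names and aliases that the local codecs module can resolve, lowercased and deduplicated, and prints a Python fragment. A sketch of the output shape, with illustrative entries (the exact set depends on the registry contents and the local Python build):

    # python iana_parse.py <registry-url>  emits something like:
    encodings = frozenset((
        "ansi_x3.4-1968",
        "us-ascii",
        # ... one entry per recognised name/alias ...
        ))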
diff --git a/testing/web-platform/tests/tools/html5lib/utils/spider.py b/testing/web-platform/tests/tools/html5lib/utils/spider.py
new file mode 100644
index 000000000..a7b803197
--- /dev/null
+++ b/testing/web-platform/tests/tools/html5lib/utils/spider.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+"""Spider to try and find bugs in the parser. Requires httplib2 and html5lib
+
+usage:
+import spider
+s = spider.Spider()
+s.run("http://www.google.com", maxURLs=100)
+"""
+
+import urllib.request, urllib.error, urllib.parse
+import urllib.robotparser
+import hashlib
+
+import httplib2
+
+import html5lib
+from html5lib import treebuilders
+
+class Spider(object):
+    def __init__(self):
+        self.unvisitedURLs = set()
+        self.visitedURLs = set()
+        self.buggyURLs = set()
+        self.robotParser = urllib.robotparser.RobotFileParser()
+        self.contentDigest = {}
+        self.http = httplib2.Http(".cache")
+
+    def run(self, initialURL, maxURLs=1000):
+        urlNumber = 0
+        self.visitedURLs.add(initialURL)
+        content = self.loadURL(initialURL)
+        while maxURLs is None or urlNumber < maxURLs:
+            if content is not None:
+                self.parse(content)
+            urlNumber += 1
+            if not self.unvisitedURLs:
+                break
+            content = self.loadURL(self.unvisitedURLs.pop())
+
+    def parse(self, content):
+        failed = False
+        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree"))
+        try:
+            tree = p.parse(content)
+        except Exception:
+            self.buggyURLs.add(self.currentURL)
+            failed = True
+            print("BUGGY:", self.currentURL)
+        self.visitedURLs.add(self.currentURL)
+        if not failed:
+            self.updateURLs(tree)
+
+    def loadURL(self, url):
+        resp, content = self.http.request(url, "GET")
+        self.currentURL = url
+        digest = hashlib.md5(content).hexdigest()
+        if digest in self.contentDigest:
+            content = None
+            self.visitedURLs.add(url)
+        else:
+            self.contentDigest[digest] = url
+
+        if resp['status'] != "200":
+            content = None
+
+        return content
+
+    def updateURLs(self, tree):
+        """Take all the links in the current document, extract the URLs and
+        update the list of visited and unvisited URLs according to whether we
+        have seen them before or not"""
+        urls = set()
+        #Remove all links we have already visited
+        for link in tree.findall(".//a"):
+            try:
+                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                if (url and url not in self.unvisitedURLs and url
+                    not in self.visitedURLs):
+                    urls.add(url)
+            except KeyError:
+                pass
+
+        #Remove all non-http URLs and add a suitable base URL where that is
+        #missing
+        newUrls = set()
+        for url in urls:
+            splitURL = list(urllib.parse.urlsplit(url))
+            if splitURL[0] != "http":
+                continue
+            if splitURL[1] == "":
+                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
+            newUrls.add(urllib.parse.urlunsplit(splitURL))
+        urls = newUrls
+
+        responseHeaders = {}
+        #Now we want to find the content types of the links we haven't visited
+        for url in urls:
+            try:
+                resp, content = self.http.request(url, "HEAD")
+                responseHeaders[url] = resp
+            except (AttributeError, KeyError):
+                #Don't know why this happens
+                pass
+
+
+        #Remove links not of content-type html or pages not found
+        #XXX - need to deal with other status codes?
+        toVisit = set([url for url in urls if url in responseHeaders and
+                       "html" in responseHeaders[url].get('content-type', '') and
+                       responseHeaders[url]['status'] == "200"])
+
+        #Now check we are allowed to spider the page
+        for url in list(toVisit):
+            robotURL = list(urllib.parse.urlsplit(url)[:2])
+            robotURL.extend(["robots.txt", "", ""])
+            robotURL = urllib.parse.urlunsplit(robotURL)
+            self.robotParser.set_url(robotURL)
+            if not self.robotParser.can_fetch("*", url):
+                toVisit.remove(url)
+
+        self.visitedURLs.update(urls)
+        self.unvisitedURLs.update(toVisit)
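A short driving sketch, mirroring the module docstring; it assumes the file is importable as spider and that httplib2 is installed, and the start URL and page limit are illustrative:

    import spider

    s = spider.Spider()
    s.run("http://www.google.com", maxURLs=100)

    # URLs whose content made the parser raise are collected for triage.
    print("%d buggy URLs out of %d visited" % (len(s.buggyURLs), len(s.visitedURLs)))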