Diffstat (limited to 'testing/web-platform/tests/tools/html5lib/utils/spider.py')
-rw-r--r-- | testing/web-platform/tests/tools/html5lib/utils/spider.py | 122 |
1 file changed, 122 insertions, 0 deletions
diff --git a/testing/web-platform/tests/tools/html5lib/utils/spider.py b/testing/web-platform/tests/tools/html5lib/utils/spider.py
new file mode 100644
index 000000000..a7b803197
--- /dev/null
+++ b/testing/web-platform/tests/tools/html5lib/utils/spider.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+"""Spider to try and find bugs in the parser. Requires httplib2.
+
+usage:
+import spider
+s = spider.Spider()
+s.run("http://www.google.com", maxURLs=100)
+"""
+
+import urllib.request, urllib.error, urllib.parse
+import urllib.robotparser
+import hashlib
+
+import httplib2
+
+import html5lib
+from html5lib import treebuilders
+
+class Spider(object):
+    def __init__(self):
+        self.unvisitedURLs = set()
+        self.visitedURLs = set()
+        self.buggyURLs = set()
+        self.robotParser = urllib.robotparser.RobotFileParser()
+        self.contentDigest = {}
+        self.http = httplib2.Http(".cache")
+
+    def run(self, initialURL, maxURLs=1000):
+        urlNumber = 0
+        self.visitedURLs.add(initialURL)
+        content = self.loadURL(initialURL)
+        while maxURLs is None or urlNumber < maxURLs:
+            if content is not None:
+                self.parse(content)
+                urlNumber += 1
+            if not self.unvisitedURLs:
+                break
+            content = self.loadURL(self.unvisitedURLs.pop())
+
+    def parse(self, content):
+        failed = False
+        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree"))
+        try:
+            tree = p.parse(content)
+        except Exception:
+            self.buggyURLs.add(self.currentURL)
+            failed = True
+            print("BUGGY:", self.currentURL)
+        self.visitedURLs.add(self.currentURL)
+        if not failed:
+            self.updateURLs(tree)
+
+    def loadURL(self, url):
+        resp, content = self.http.request(url, "GET")
+        self.currentURL = url
+        digest = hashlib.md5(content).hexdigest()
+        if digest in self.contentDigest:
+            content = None
+            self.visitedURLs.add(url)
+        else:
+            self.contentDigest[digest] = url
+
+        if resp['status'] != "200":
+            content = None
+
+        return content
+
+    def updateURLs(self, tree):
+        """Take all the links in the current document, extract the URLs and
+        update the list of visited and unvisited URLs according to whether we
+        have seen them before or not"""
+        urls = set()
+        # Remove all links we have already visited
+        for link in tree.findall(".//a"):
+            try:
+                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                if (url and url not in self.unvisitedURLs and url
+                        not in self.visitedURLs):
+                    urls.add(url)
+            except KeyError:
+                pass
+
+        # Remove all non-http URLs and add a suitable base URL where that is
+        # missing
+        newUrls = set()
+        for url in urls:
+            splitURL = list(urllib.parse.urlsplit(url))
+            if splitURL[0] != "http":
+                continue
+            if splitURL[1] == "":
+                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
+            newUrls.add(urllib.parse.urlunsplit(splitURL))
+        urls = newUrls
+
+        responseHeaders = {}
+        # Now we want to find the content types of the links we haven't visited
+        for url in urls:
+            try:
+                resp, content = self.http.request(url, "HEAD")
+                responseHeaders[url] = resp
+            except (AttributeError, KeyError):
+                # Don't know why this happens
+                pass
+
+        # Remove links not of content-type html or pages not found
+        # XXX - need to deal with other status codes?
+        toVisit = set([url for url in urls if url in responseHeaders and
+                       "html" in responseHeaders[url]['content-type'] and
+                       responseHeaders[url]['status'] == "200"])
+
+        # Now check we are allowed to spider the page
+        for url in set(toVisit):
+            robotURL = list(urllib.parse.urlsplit(url)[:2])
+            robotURL.extend(["robots.txt", "", ""])
+            robotURL = urllib.parse.urlunsplit(robotURL)
+            self.robotParser.set_url(robotURL)
+            self.robotParser.read()
+            if not self.robotParser.can_fetch("*", url):
+                toVisit.remove(url)
+
+        self.visitedURLs.update(urls)
+        self.unvisitedURLs.update(toVisit)
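A minimal usage sketch, assuming the module above is saved as spider.py somewhere on the import path and that httplib2 is installed; the start URL and the maxURLs value are illustrative, not part of the original file:

    import spider

    s = spider.Spider()
    # Crawl outward from the start URL, fetching at most 25 pages.
    s.run("http://example.com/", maxURLs=25)
    # URLs whose content crashed the html5lib parser are collected here.
    print(s.buggyURLs)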