#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and html5lib.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import hashlib
import urllib.parse
import urllib.robotparser

import httplib2

import html5lib


class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        """Fetch initialURL and keep spidering until maxURLs documents have
        been parsed or there are no unvisited URLs left."""
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        """Parse the document; record the URL as buggy if the parser raises."""
        failed = False
        # namespaceHTMLElements=False keeps tag names plain so that the
        # .//a search in updateURLs works
        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree"),
                                namespaceHTMLElements=False)
        try:
            tree = p.parse(content)
        except Exception:
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        """GET the URL and return its body, or None if the body duplicates a
        page we have already seen or the response status is not 200."""
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url
        if resp['status'] != "200":
            content = None
        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Remove all links we have already visited
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and
                        url not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url].get('content-type', '') and
                       responseHeaders[url]['status'] == "200"])

        # Now check we are allowed to spider the page; iterate over a copy so
        # that disallowed URLs can be removed from the set as we go
        for url in list(toVisit):
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
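

if __name__ == "__main__":
    # Illustrative entry point, not part of the original module: the start URL
    # and maxURLs value below are arbitrary examples, assuming the file is
    # saved as spider.py and run directly.
    import sys

    start = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(start, maxURLs=100)
    print("Visited %d URLs, %d looked buggy" %
          (len(s.visitedURLs), len(s.buggyURLs)))
    for url in sorted(s.buggyURLs):
        print(url)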