import html5lib
import html5lib.treebuilders.dom

# Expected use:
#   curl --compressed http://www.whatwg.org/specs/web-apps/current-work/ >current-work
#   python specextract.py
#
# Generates current-work-canvas.xhtml, for use by gentest.py to create the
# annotated spec document

_SITE_ROOT = 'http://www.whatwg.org'
_SPEC_BASE = 'http://www.whatwg.org/specs/web-apps/current-work/'


def _make_absolute(uri):
    """Return *uri* resolved against the WHATWG spec location.

    ``data:`` URIs and URIs that already carry an ``http:``/``https:``
    scheme are returned unchanged.  Protocol-relative URIs (``//host/...``)
    get an ``http:`` scheme, site-absolute paths (``/...``) are prefixed
    with the site root, and everything else (including the empty string)
    is treated as relative to the spec directory.
    """
    if uri.startswith(('data:', 'http:', 'https:')):
        return uri
    if uri.startswith('//'):
        return 'http:' + uri
    # startswith() rather than uri[0] so an empty attribute value does not
    # raise IndexError.
    if uri.startswith('/'):
        return _SITE_ROOT + uri
    return _SPEC_BASE + uri


def extract():
    """Extract the canvas section of the spec into a standalone XHTML file.

    Reads the HTML file ``current-work`` from the current directory, strips
    ``<script>`` children from ``<head>``, keeps only the document header
    and the ``the-canvas-element`` section (up to the next ``<h4>``) among
    the section's siblings, rewrites ``<link href>`` and ``<img src>`` to
    absolute URIs, and writes the result as UTF-8 XHTML to
    ``current-work-canvas.xhtml``.

    Raises:
        IndexError: if the document has no ``<head>``, no ``<header>``, or
            no ``<h4 id="the-canvas-element">``.
    """
    parser = html5lib.html5parser.HTMLParser(
        tree=html5lib.treebuilders.dom.TreeBuilder)
    # Binary mode: the encoding is passed to the parser explicitly, so it
    # must receive raw bytes rather than already-decoded text.
    with open('current-work', 'rb') as source:
        doc = parser.parse(source, encoding='utf-8')

    head = doc.getElementsByTagName('head')[0]
    # Iterate over a snapshot: removing from the live child list while
    # iterating it skips the sibling after each removed node.  nodeName is
    # used instead of tagName because text/comment nodes have no tagName.
    for n in list(head.childNodes):
        if n.nodeName == 'script':
            head.removeChild(n)

    header = doc.getElementsByTagName('header')[0]
    # doc.getElementById('the-canvas') doesn't work (?!), so scan the h4
    # headings for the wanted id instead.
    thecanvas = [n for n in doc.getElementsByTagName('h4')
                 if n.getAttribute('id') == 'the-canvas-element'][0]

    # Keep the heading and every following sibling up to (not including)
    # the next h4; stop cleanly if the section runs to the end of its parent.
    keep = [header, thecanvas]
    node = thecanvas.nextSibling
    while node is not None and node.nodeName != 'h4':
        keep.append(node)
        node = node.nextSibling

    p = thecanvas.parentNode
    for n in p.childNodes[:]:
        if n not in keep:
            p.removeChild(n)

    # Trim the header down to its first three and last four child nodes.
    for n in header.childNodes[3:-4]:
        header.removeChild(n)

    # Fix the stylesheet, icon and image references
    for e in doc.getElementsByTagName('link'):
        e.setAttribute('href', _make_absolute(e.getAttribute('href')))
    for img in doc.getElementsByTagName('img'):
        img.setAttribute('src', _make_absolute(img.getAttribute('src')))

    # Convert to XHTML, because it's quicker to re-parse than HTML5
    root = doc.documentElement
    root.setAttribute('xmlns', 'http://www.w3.org/1999/xhtml')
    root.setAttribute('xml:lang', root.getAttribute('lang'))
    doc.removeChild(doc.firstChild)  # remove the DOCTYPE

    # toxml(encoding=...) yields encoded bytes, so write in binary mode.
    with open('current-work-canvas.xhtml', 'wb') as out:
        out.write(doc.toxml(encoding='UTF-8'))


extract()