import html5lib
import html5lib.treebuilders.dom

# Expected use:
#   curl --compressed http://www.whatwg.org/specs/web-apps/current-work/ >current-work
#   python specextract.py
#
# Generates current-work-canvas.xhtml, for use by gentest.py to create the annotated spec document

_SPEC_ORIGIN = 'http://www.whatwg.org'
_SPEC_BASE = 'http://www.whatwg.org/specs/web-apps/current-work/'


def _make_absolute(uri):
    """Return *uri* as an absolute URI relative to the WHATWG spec location.

    ``data:`` URIs and URIs that already carry an http(s) scheme are returned
    unchanged, protocol-relative URIs get an ``http:`` scheme, site-relative
    URIs are resolved against the site origin, and everything else is
    resolved against the spec directory.
    """
    if uri.startswith(('data:', 'http:', 'https:')):
        return uri
    if uri.startswith('//'):
        return 'http:' + uri
    if uri.startswith('/'):
        return _SPEC_ORIGIN + uri
    return _SPEC_BASE + uri


def _absolutize_attribute(doc, tag_name, attribute):
    """Make *attribute* absolute on every *tag_name* element that has a value for it."""
    for element in doc.getElementsByTagName(tag_name):
        value = element.getAttribute(attribute)
        # getAttribute returns '' for a missing attribute; leave those
        # elements alone rather than inventing an attribute for them.
        if value:
            element.setAttribute(attribute, _make_absolute(value))


def extract():
    """Cut the canvas section out of the downloaded spec and save it as XHTML.

    Reads the file ``current-work`` from the working directory, strips the
    scripts from ``<head>``, keeps only the ``<header>`` and the section that
    starts at the ``h4`` with id ``the-canvas-element`` (up to the next
    ``h4`` sibling), makes link/image URIs absolute, and writes the result to
    ``current-work-canvas.xhtml`` encoded as UTF-8.

    Raises:
        IndexError: if the document has no ``head``, no ``header`` or no
            ``h4`` element with id ``the-canvas-element``.
    """
    parser = html5lib.html5parser.HTMLParser(tree=html5lib.treebuilders.dom.TreeBuilder)
    # Binary mode: the parser is told the encoding explicitly, so it must be
    # handed bytes rather than already-decoded text.
    # NOTE(review): `encoding=` is the keyword used by the html5lib release
    # this script was written for -- confirm against the installed version.
    with open('current-work', 'rb') as source:
        doc = parser.parse(source, encoding='utf-8')

    head = doc.getElementsByTagName('head')[0]
    # Iterate over a snapshot (removing from the live list skips siblings) and
    # compare nodeName, which text/comment nodes have too (unlike tagName).
    for child in list(head.childNodes):
        if child.nodeName == 'script':
            head.removeChild(child)

    header = doc.getElementsByTagName('header')[0]
    # doc.getElementById('the-canvas-element') finds nothing here, presumably
    # because minidom only matches attributes flagged as type ID -- so search
    # the h4 headings by hand.
    thecanvas = [
        h4 for h4 in doc.getElementsByTagName('h4')
        if h4.getAttribute('id') == 'the-canvas-element'
    ][0]

    # The section is the heading plus every following sibling up to (but not
    # including) the next h4, or up to the end of the parent if there is none.
    keep = [header, thecanvas]
    node = thecanvas.nextSibling
    while node is not None and node.nodeName != 'h4':
        keep.append(node)
        node = node.nextSibling

    # Nodes are compared by identity; a set of ids keeps the filter linear.
    keep_ids = set(id(kept) for kept in keep)
    parent = thecanvas.parentNode
    for child in list(parent.childNodes):
        if id(child) not in keep_ids:
            parent.removeChild(child)

    # Trim the header down to its first three and last four child nodes.
    for child in header.childNodes[3:-4]:
        header.removeChild(child)

    # Fix the stylesheet, icon and image references
    _absolutize_attribute(doc, 'link', 'href')
    _absolutize_attribute(doc, 'img', 'src')

    # Convert to XHTML, because it's quicker to re-parse than HTML5
    root = doc.documentElement
    root.setAttribute('xmlns', 'http://www.w3.org/1999/xhtml')
    root.setAttribute('xml:lang', root.getAttribute('lang'))
    # Drop the DOCTYPE, but only if the first child really is one; otherwise
    # this would throw away a comment or even the root element.
    first = doc.firstChild
    if first is not None and first.nodeType == first.DOCUMENT_TYPE_NODE:
        doc.removeChild(first)

    # toxml(encoding=...) returns encoded bytes, so write in binary mode.
    with open('current-work-canvas.xhtml', 'wb') as out:
        out.write(doc.toxml(encoding='UTF-8'))

# Run the extraction as soon as this file is executed. There is no __main__
# guard, so importing the module triggers the extraction as well.
extract()