diff options
Diffstat (limited to 'testing/web-platform/tests/tools/html5lib/parse.py')
-rwxr-xr-x | testing/web-platform/tests/tools/html5lib/parse.py | 233 |
1 files changed, 233 insertions, 0 deletions
#!/usr/bin/env python
"""usage: %prog [options] filename

Parse a document to a tree, with optional profiling
"""

import sys
import os
import traceback
from optparse import OptionParser

from html5lib import html5parser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants


def _openInput(args):
    """Open the document named by the last positional argument.

    The argument may be an ``http://``/``https://`` URL, ``-`` for standard
    input, or a path on the local file system.

    Returns a ``(source, encoding)`` pair. ``encoding`` defaults to
    ``"utf8"``; for URLs that send a Content-Type header it is the header's
    charset (``None`` when the header names none), and for standard input on
    Python 3 it is ``None``.

    Writes a message to stderr and exits with status 1 when no argument was
    given or the input cannot be opened.
    """
    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    name = args[-1]
    encoding = "utf8"

    if name.startswith(("http://", "https://")):
        try:
            import urllib.request
            source = urllib.request.urlopen(name)
        except Exception as e:
            # Fail loudly: carrying on would parse the URL text itself as
            # though it were the document.
            sys.stderr.write("Unable to open URL: %s\n" % e)
            sys.exit(1)
        if source.headers.get('content-type'):
            encoding = source.headers.get_content_charset()
        return source, encoding

    if name == '-':
        if sys.version_info[0] >= 3:
            encoding = None
        return sys.stdin, encoding

    try:
        return open(name, "rb"), encoding
    except IOError as e:
        sys.stderr.write("Unable to open file: %s\n" % e)
        sys.exit(1)


def _writeOut(data):
    """Write *data* to stdout, whether it is text or already-encoded bytes."""
    rawOut = getattr(sys.stdout, "buffer", None)
    if str is not bytes and isinstance(data, bytes) and rawOut is not None:
        # Python 3 text streams reject bytes; flush first so the raw write
        # stays ordered after anything print() has already buffered.
        sys.stdout.flush()
        rawOut.write(data)
    else:
        sys.stdout.write(data)


def parse():
    """Command-line entry point: parse the input and print what was asked for.

    Reads options from ``sys.argv`` via getOptParser(), opens the input,
    builds an HTMLParser with the selected tree builder and tokenizer, and
    then either profiles the run (``--profile``), times it (``--time``) or
    simply runs it, printing the result through printOutput().
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()
    f, encoding = _openInput(args)

    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        if opts.sanitize:
            tokenizer = sanitizer.HTMLSanitizer
        else:
            tokenizer = HTMLTokenizer

        p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer,
                                   debug=opts.log)

        if opts.fragment:
            parseMethod = p.parseFragment
        else:
            parseMethod = p.parse

        if opts.profile:
            import cProfile
            import pstats
            cProfile.runctx("run(parseMethod, f, encoding)", None,
                            {"run": run,
                             "parseMethod": parseMethod,
                             "f": f,
                             "encoding": encoding},
                            "stats.prof")
            # XXX - We should use a temp file here
            stats = pstats.Stats('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = run(parseMethod, f, encoding)
            t1 = time.time()
            if document:
                printOutput(p, document, opts)
                t2 = time.time()
                sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
            else:
                sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
        else:
            document = run(parseMethod, f, encoding)
            if document:
                printOutput(p, document, opts)
    finally:
        # Release the file or HTTP response; stdin is not ours to close.
        if f is not sys.stdin:
            f.close()


def run(parseMethod, f, encoding):
    """Call ``parseMethod(f, encoding=encoding)`` and return its result.

    If the parse raises, the traceback is printed and ``None`` is returned.
    """
    try:
        document = parseMethod(f, encoding=encoding)
    except Exception:
        document = None
        traceback.print_exc()
    return document


def printOutput(parser, document, opts):
    """Print the parse result in the format selected by *opts*.

    Optionally prints the detected character encoding and the parser's log,
    then the document as XML, a debug tree, highlighted code or serialized
    HTML (the first of ``xml``/``tree``/``hilite``/``html`` that is set),
    and finally the list of parse errors when ``opts.error`` is set.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            _writeOut(document.toxml("utf-8"))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            _writeOut(document.hilite("utf-8"))
        elif opts.html:
            # Forward every serializer option that has a command-line flag.
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except AttributeError:
                    pass
            # An unset quote character must fall back to the serializer's
            # own default rather than being passed through as None.
            if not kwargs.get('quote_char'):
                kwargs.pop('quote_char', None)

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                _writeOut(text)
                # NOTE(review): assumed to sit inside the loop, so each
                # serialized chunk ends with a newline - confirm against
                # the original indentation.
                if not text.endswith('\n'):
                    sys.stdout.write('\n')
    if opts.error:
        errList = ["Line %i Col %i" % pos + " " +
                   constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars
                   for pos, errorcode, datavars in parser.errors]
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")


def getOptParser():
    """Build and return the OptionParser describing the command-line flags."""
    parser = OptionParser(usage=__doc__)

    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the cProfile profiler to "
                      "produce a detailed log of the run")

    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")

    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="simpleTree")

    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")

    parser.add_option("-f", "--fragment", action="store_true", default=False,
                      dest="fragment", help="Parse as a fragment")

    parser.add_option("", "--tree", action="store_true", default=False,
                      dest="tree", help="Output as debug tree")

    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="Output as xml")

    parser.add_option("", "--no-html", action="store_false", default=True,
                      dest="html", help="Don't output html")

    parser.add_option("", "--hilite", action="store_true", default=False,
                      dest="hilite", help="Output as formatted highlighted code.")

    parser.add_option("-c", "--encoding", action="store_true", default=False,
                      dest="encoding", help="Print character encoding used")

    # The destinations below are looked up by name in printOutput() and
    # passed to the HTML serializer as keyword arguments.
    parser.add_option("", "--inject-meta-charset", action="store_true",
                      default=False, dest="inject_meta_charset",
                      help="inject <meta charset>")

    parser.add_option("", "--strip-whitespace", action="store_true",
                      default=False, dest="strip_whitespace",
                      help="strip whitespace")

    parser.add_option("", "--omit-optional-tags", action="store_true",
                      default=False, dest="omit_optional_tags",
                      help="omit optional tags")

    parser.add_option("", "--quote-attr-values", action="store_true",
                      default=False, dest="quote_attr_values",
                      help="quote attribute values")

    parser.add_option("", "--use-best-quote-char", action="store_true",
                      default=False, dest="use_best_quote_char",
                      help="use best quote character")

    parser.add_option("", "--quote-char", action="store",
                      default=None, dest="quote_char",
                      help="quote character")

    parser.add_option("", "--no-minimize-boolean-attributes",
                      action="store_false", default=True,
                      dest="minimize_boolean_attributes",
                      help="don't minimize boolean attributes")

    parser.add_option("", "--use-trailing-solidus", action="store_true",
                      default=False, dest="use_trailing_solidus",
                      help="use trailing solidus")

    parser.add_option("", "--space-before-trailing-solidus",
                      action="store_true", default=False,
                      dest="space_before_trailing_solidus",
                      help="add space before trailing solidus")

    parser.add_option("", "--escape-lt-in-attrs", action="store_true",
                      default=False, dest="escape_lt_in_attrs",
                      help="escape less than signs in attribute values")

    parser.add_option("", "--escape-rcdata", action="store_true",
                      default=False, dest="escape_rcdata",
                      help="escape rcdata element values")

    parser.add_option("", "--sanitize", action="store_true", default=False,
                      dest="sanitize", help="sanitize")

    parser.add_option("-l", "--log", action="store_true", default=False,
                      dest="log", help="log state transitions")

    return parser


if __name__ == "__main__":
    parse()