# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. #default_value:foo #include: other.manifest # #[test_name.js] # expected: ERROR # # [subtest 1] # expected: # os == win: FAIL #This is a comment # PASS # # TODO: keep comments in the tree import types from cStringIO import StringIO from node import * class ParseError(Exception): def __init__(self, filename, line, detail): self.line = line self.filename = filename self.detail = detail self.message = "%s: %s line %s" % (self.detail, self.filename, self.line) Exception.__init__(self, self.message) eol = object group_start = object group_end = object digits = "0123456789" open_parens = "[(" close_parens = "])" parens = open_parens + close_parens operator_chars = "=!" unary_operators = ["not"] binary_operators = ["==", "!=", "and", "or"] operators = ["==", "!=", "not", "and", "or"] atoms = {"True": True, "False": False, "Reset": object()} def decode(byte_str): return byte_str.decode("utf8") def precedence(operator_node): return len(operators) - operators.index(operator_node.data) class TokenTypes(object): def __init__(self): for type in ["group_start", "group_end", "paren", "list_start", "list_end", "separator", "ident", "string", "number", "atom", "eof"]: setattr(self, type, type) token_types = TokenTypes() class Tokenizer(object): def __init__(self): self.reset() def reset(self): self.indent_levels = [0] self.state = self.line_start_state self.next_state = self.data_line_state self.line_number = 0 def tokenize(self, stream): self.reset() if type(stream) in types.StringTypes: stream = StringIO(stream) if not hasattr(stream, "name"): self.filename = "" else: self.filename = stream.name self.next_line_state = self.line_start_state for i, line in enumerate(stream): self.state = self.next_line_state assert self.state is not None states = [] self.next_line_state = None self.line_number = i + 1 self.index = 0 self.line = line.rstrip() while self.state != self.eol_state: states.append(self.state) tokens = self.state() if tokens: for token in tokens: yield token self.state() while True: yield (token_types.eof, None) def char(self): if self.index == len(self.line): return eol return self.line[self.index] def consume(self): if self.index < len(self.line): self.index += 1 def peek(self, length): return self.line[self.index:self.index + length] def skip_whitespace(self): while self.char() == " ": self.consume() def eol_state(self): if self.next_line_state is None: self.next_line_state = self.line_start_state def line_start_state(self): self.skip_whitespace() if self.char() == eol: self.state = self.eol_state return if self.index > self.indent_levels[-1]: self.indent_levels.append(self.index) yield (token_types.group_start, None) else: while self.index < self.indent_levels[-1]: self.indent_levels.pop() yield (token_types.group_end, None) # This is terrible; if we were parsing an expression # then the next_state will be expr_or_value but when we deindent # it must always be a heading or key next so we go back to data_line_state self.next_state = self.data_line_state if self.index != self.indent_levels[-1]: raise ParseError(self.filename, self.line_number, "Unexpected indent") self.state = self.next_state def data_line_state(self): if self.char() == "[": yield (token_types.paren, self.char()) self.consume() self.state = self.heading_state else: self.state = self.key_state def heading_state(self): rv = "" while True: c = self.char() if c == "\\": rv += self.consume_escape() elif c == "]": break elif c == eol: raise ParseError(self.filename, self.line_number, "EOL in heading") else: rv += c self.consume() yield (token_types.string, decode(rv)) yield (token_types.paren, "]") self.consume() self.state = self.line_end_state self.next_state = self.data_line_state def key_state(self): rv = "" while True: c = self.char() if c == " ": self.skip_whitespace() if self.char() != ":": raise ParseError(self.filename, self.line_number, "Space in key name") break elif c == ":": break elif c == eol: raise ParseError(self.filename, self.line_number, "EOL in key name (missing ':'?)") elif c == "\\": rv += self.consume_escape() else: rv += c self.consume() yield (token_types.string, decode(rv)) yield (token_types.separator, ":") self.consume() self.state = self.after_key_state def after_key_state(self): self.skip_whitespace() c = self.char() if c == "#": self.next_state = self.expr_or_value_state self.state = self.comment_state elif c == eol: self.next_state = self.expr_or_value_state self.state = self.eol_state elif c == "[": self.state = self.list_start_state else: self.state = self.value_state def list_start_state(self): yield (token_types.list_start, "[") self.consume() self.state = self.list_value_start_state def list_value_start_state(self): self.skip_whitespace() if self.char() == "]": self.state = self.list_end_state elif self.char() in ("'", '"'): quote_char = self.char() self.consume() yield (token_types.string, self.consume_string(quote_char)) self.skip_whitespace() if self.char() == "]": self.state = self.list_end_state elif self.char() != ",": raise ParseError(self.filename, self.line_number, "Junk after quoted string") self.consume() elif self.char() == "#": self.state = self.comment_state self.next_line_state = self.list_value_start_state elif self.char() == eol: self.next_line_state = self.list_value_start_state self.state = self.eol_state elif self.char() == ",": raise ParseError(self.filename, self.line_number, "List item started with separator") elif self.char() == "@": self.state = self.list_value_atom_state else: self.state = self.list_value_state def list_value_state(self): rv = "" spaces = 0 while True: c = self.char() if c == "\\": escape = self.consume_escape() rv += escape elif c == eol: raise ParseError(self.filename, self.line_number, "EOL in list value") elif c == "#": raise ParseError(self.filename, self.line_number, "EOL in list value (comment)") elif c == ",": self.state = self.list_value_start_state self.consume() break elif c == " ": spaces += 1 self.consume() elif c == "]": self.state = self.list_end_state self.consume() break else: rv += " " * spaces spaces = 0 rv += c self.consume() if rv: yield (token_types.string, decode(rv)) def list_value_atom_state(self): self.consume() for _, value in self.list_value_state(): yield token_types.atom, value def list_end_state(self): self.consume() yield (token_types.list_end, "]") self.state = self.line_end_state def value_state(self): self.skip_whitespace() if self.char() in ("'", '"'): quote_char = self.char() self.consume() yield (token_types.string, self.consume_string(quote_char)) if self.char() == "#": self.state = self.comment_state else: self.state = self.line_end_state elif self.char() == "@": self.consume() for _, value in self.value_inner_state(): yield token_types.atom, value else: self.state = self.value_inner_state def value_inner_state(self): rv = "" spaces = 0 while True: c = self.char() if c == "\\": rv += self.consume_escape() elif c == "#": self.state = self.comment_state break elif c == " ": # prevent whitespace before comments from being included in the value spaces += 1 self.consume() elif c == eol: self.state = self.line_end_state break else: rv += " " * spaces spaces = 0 rv += c self.consume() yield (token_types.string, decode(rv)) def comment_state(self): while self.char() is not eol: self.consume() self.state = self.eol_state def line_end_state(self): self.skip_whitespace() c = self.char() if c == "#": self.state = self.comment_state elif c == eol: self.state = self.eol_state else: raise ParseError(self.filename, self.line_number, "Junk before EOL %s" % c) def consume_string(self, quote_char): rv = "" while True: c = self.char() if c == "\\": rv += self.consume_escape() elif c == quote_char: self.consume() break elif c == eol: raise ParseError(self.filename, self.line_number, "EOL in quoted string") else: rv += c self.consume() return decode(rv) def expr_or_value_state(self): if self.peek(3) == "if ": self.state = self.expr_state else: self.state = self.value_state def expr_state(self): self.skip_whitespace() c = self.char() if c == eol: raise ParseError(self.filename, self.line_number, "EOL in expression") elif c in "'\"": self.consume() yield (token_types.string, self.consume_string(c)) elif c == "#": raise ParseError(self.filename, self.line_number, "Comment before end of expression") elif c == ":": yield (token_types.separator, c) self.consume() self.state = self.value_state elif c in parens: self.consume() yield (token_types.paren, c) elif c in ("!", "="): self.state = self.operator_state elif c in digits: self.state = self.digit_state else: self.state = self.ident_state def operator_state(self): # Only symbolic operators index_0 = self.index while True: c = self.char() if c == eol: break elif c in operator_chars: self.consume() else: self.state = self.expr_state break yield (token_types.ident, self.line[index_0:self.index]) def digit_state(self): index_0 = self.index seen_dot = False while True: c = self.char() if c == eol: break elif c in digits: self.consume() elif c == ".": if seen_dot: raise ParseError(self.filename, self.line_number, "Invalid number") self.consume() seen_dot = True elif c in parens: break elif c in operator_chars: break elif c == " ": break elif c == ":": break else: raise ParseError(self.filename, self.line_number, "Invalid character in number") self.state = self.expr_state yield (token_types.number, self.line[index_0:self.index]) def ident_state(self): index_0 = self.index while True: c = self.char() if c == eol: break elif c == ".": break elif c in parens: break elif c in operator_chars: break elif c == " ": break elif c == ":": break else: self.consume() self.state = self.expr_state yield (token_types.ident, self.line[index_0:self.index]) def consume_escape(self): assert self.char() == "\\" self.consume() c = self.char() self.consume() if c == "x": return self.decode_escape(2) elif c == "u": return self.decode_escape(4) elif c == "U": return self.decode_escape(6) elif c in ["a", "b", "f", "n", "r", "t", "v"]: return eval("'\%s'" % c) elif c is eol: raise ParseError(self.filename, self.line_number, "EOL in escape") else: return c def decode_escape(self, length): value = 0 for i in xrange(length): c = self.char() value *= 16 value += self.escape_value(c) self.consume() return unichr(value).encode("utf8") def escape_value(self, c): if '0' <= c <= '9': return ord(c) - ord('0') elif 'a' <= c <= 'f': return ord(c) - ord('a') + 10 elif 'A' <= c <= 'F': return ord(c) - ord('A') + 10 else: raise ParseError(self.filename, self.line_number, "Invalid character escape") class Parser(object): def __init__(self): self.reset() def reset(self): self.token = None self.unary_operators = "!" self.binary_operators = frozenset(["&&", "||", "=="]) self.tokenizer = Tokenizer() self.token_generator = None self.tree = Treebuilder(DataNode(None)) self.expr_builder = None self.expr_builders = [] def parse(self, input): self.reset() self.token_generator = self.tokenizer.tokenize(input) self.consume() self.manifest() return self.tree.node def consume(self): self.token = self.token_generator.next() def expect(self, type, value=None): if self.token[0] != type: raise ParseError if value is not None: if self.token[1] != value: raise ParseError self.consume() def manifest(self): self.data_block() self.expect(token_types.eof) def data_block(self): while self.token[0] == token_types.string: self.tree.append(KeyValueNode(self.token[1])) self.consume() self.expect(token_types.separator) self.value_block() self.tree.pop() while self.token == (token_types.paren, "["): self.consume() if self.token[0] != token_types.string: raise ParseError self.tree.append(DataNode(self.token[1])) self.consume() self.expect(token_types.paren, "]") if self.token[0] == token_types.group_start: self.consume() self.data_block() self.eof_or_end_group() self.tree.pop() def eof_or_end_group(self): if self.token[0] != token_types.eof: self.expect(token_types.group_end) def value_block(self): if self.token[0] == token_types.list_start: self.consume() self.list_value() elif self.token[0] == token_types.string: self.value() elif self.token[0] == token_types.group_start: self.consume() self.expression_values() if self.token[0] == token_types.string: self.value() self.eof_or_end_group() elif self.token[0] == token_types.atom: self.atom() else: raise ParseError def list_value(self): self.tree.append(ListNode()) while self.token[0] in (token_types.atom, token_types.string): if self.token[0] == token_types.atom: self.atom() else: self.value() self.expect(token_types.list_end) self.tree.pop() def expression_values(self): while self.token == (token_types.ident, "if"): self.consume() self.tree.append(ConditionalNode()) self.expr_start() self.expect(token_types.separator) if self.token[0] == token_types.string: self.value() else: raise ParseError self.tree.pop() def value(self): self.tree.append(ValueNode(self.token[1])) self.consume() self.tree.pop() def atom(self): if self.token[1] not in atoms: raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Unrecognised symbol @%s" % self.token[1]) self.tree.append(AtomNode(atoms[self.token[1]])) self.consume() self.tree.pop() def expr_start(self): self.expr_builder = ExpressionBuilder(self.tokenizer) self.expr_builders.append(self.expr_builder) self.expr() expression = self.expr_builder.finish() self.expr_builders.pop() self.expr_builder = self.expr_builders[-1] if self.expr_builders else None if self.expr_builder: self.expr_builder.operands[-1].children[-1].append(expression) else: self.tree.append(expression) self.tree.pop() def expr(self): self.expr_operand() while (self.token[0] == token_types.ident and self.token[1] in binary_operators): self.expr_bin_op() self.expr_operand() def expr_operand(self): if self.token == (token_types.paren, "("): self.consume() self.expr_builder.left_paren() self.expr() self.expect(token_types.paren, ")") self.expr_builder.right_paren() elif self.token[0] == token_types.ident and self.token[1] in unary_operators: self.expr_unary_op() self.expr_operand() elif self.token[0] in [token_types.string, token_types.ident]: self.expr_value() elif self.token[0] == token_types.number: self.expr_number() else: raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Unrecognised operand") def expr_unary_op(self): if self.token[1] in unary_operators: self.expr_builder.push_operator(UnaryOperatorNode(self.token[1])) self.consume() else: raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Expected unary operator") def expr_bin_op(self): if self.token[1] in binary_operators: self.expr_builder.push_operator(BinaryOperatorNode(self.token[1])) self.consume() else: raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Expected binary operator") def expr_value(self): node_type = {token_types.string: StringNode, token_types.ident: VariableNode}[self.token[0]] self.expr_builder.push_operand(node_type(self.token[1])) self.consume() if self.token == (token_types.paren, "["): self.consume() self.expr_builder.operands[-1].append(IndexNode()) self.expr_start() self.expect(token_types.paren, "]") def expr_number(self): self.expr_builder.push_operand(NumberNode(self.token[1])) self.consume() class Treebuilder(object): def __init__(self, root): self.root = root self.node = root def append(self, node): self.node.append(node) self.node = node return node def pop(self): node = self.node self.node = self.node.parent return node class ExpressionBuilder(object): def __init__(self, tokenizer): self.operands = [] self.operators = [None] self.tokenizer = tokenizer def finish(self): while self.operators[-1] is not None: self.pop_operator() rv = self.pop_operand() assert self.is_empty() return rv def left_paren(self): self.operators.append(None) def right_paren(self): while self.operators[-1] is not None: self.pop_operator() if not self.operators: raise ParseError(self.tokenizer.filename, self.tokenizer.line, "Unbalanced parens") assert self.operators.pop() is None def push_operator(self, operator): assert operator is not None while self.precedence(self.operators[-1]) > self.precedence(operator): self.pop_operator() self.operators.append(operator) def pop_operator(self): operator = self.operators.pop() if isinstance(operator, BinaryOperatorNode): operand_1 = self.operands.pop() operand_0 = self.operands.pop() self.operands.append(BinaryExpressionNode(operator, operand_0, operand_1)) else: operand_0 = self.operands.pop() self.operands.append(UnaryExpressionNode(operator, operand_0)) def push_operand(self, node): self.operands.append(node) def pop_operand(self): return self.operands.pop() def is_empty(self): return len(self.operands) == 0 and all(item is None for item in self.operators) def precedence(self, operator): if operator is None: return 0 return precedence(operator) def parse(stream): p = Parser() return p.parse(stream)