py: Improve memory management for parser; add lexer error for bad line continuation.

This commit is contained in:
Damien George 2014-01-12 13:55:24 +00:00
parent 97eb73cf84
commit 69a818d418
3 changed files with 57 additions and 22 deletions

View File

@ -299,8 +299,15 @@ static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
// backslash (outside string literals) must appear just before a physical newline // backslash (outside string literals) must appear just before a physical newline
next_char(lex); next_char(lex);
if (!is_physical_newline(lex)) { if (!is_physical_newline(lex)) {
// TODO SyntaxError // SyntaxError: unexpected character after line continuation character
assert(0); tok->src_name = lex->name;
tok->src_line = lex->line;
tok->src_column = lex->column;
tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
vstr_reset(&lex->vstr);
tok->str = vstr_str(&lex->vstr);
tok->len = 0;
return;
} else { } else {
next_char(lex); next_char(lex);
} }

View File

@ -10,19 +10,20 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_INVALID, MP_TOKEN_INVALID,
MP_TOKEN_DEDENT_MISMATCH, MP_TOKEN_DEDENT_MISMATCH,
MP_TOKEN_LONELY_STRING_OPEN, MP_TOKEN_LONELY_STRING_OPEN,
MP_TOKEN_BAD_LINE_CONTINUATION,
MP_TOKEN_NEWLINE, // 4 MP_TOKEN_NEWLINE, // 5
MP_TOKEN_INDENT, // 5 MP_TOKEN_INDENT, // 6
MP_TOKEN_DEDENT, // 6 MP_TOKEN_DEDENT, // 7
MP_TOKEN_NAME, // 7 MP_TOKEN_NAME, // 8
MP_TOKEN_NUMBER, MP_TOKEN_NUMBER,
MP_TOKEN_STRING, MP_TOKEN_STRING,
MP_TOKEN_BYTES, MP_TOKEN_BYTES,
MP_TOKEN_ELLIPSIS, MP_TOKEN_ELLIPSIS,
MP_TOKEN_KW_FALSE, // 12 MP_TOKEN_KW_FALSE, // 13
MP_TOKEN_KW_NONE, MP_TOKEN_KW_NONE,
MP_TOKEN_KW_TRUE, MP_TOKEN_KW_TRUE,
MP_TOKEN_KW_AND, MP_TOKEN_KW_AND,
@ -31,7 +32,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_KW_BREAK, MP_TOKEN_KW_BREAK,
MP_TOKEN_KW_CLASS, MP_TOKEN_KW_CLASS,
MP_TOKEN_KW_CONTINUE, MP_TOKEN_KW_CONTINUE,
MP_TOKEN_KW_DEF, // 21 MP_TOKEN_KW_DEF, // 22
MP_TOKEN_KW_DEL, MP_TOKEN_KW_DEL,
MP_TOKEN_KW_ELIF, MP_TOKEN_KW_ELIF,
MP_TOKEN_KW_ELSE, MP_TOKEN_KW_ELSE,
@ -41,7 +42,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_KW_FROM, MP_TOKEN_KW_FROM,
MP_TOKEN_KW_GLOBAL, MP_TOKEN_KW_GLOBAL,
MP_TOKEN_KW_IF, MP_TOKEN_KW_IF,
MP_TOKEN_KW_IMPORT, // 31 MP_TOKEN_KW_IMPORT, // 32
MP_TOKEN_KW_IN, MP_TOKEN_KW_IN,
MP_TOKEN_KW_IS, MP_TOKEN_KW_IS,
MP_TOKEN_KW_LAMBDA, MP_TOKEN_KW_LAMBDA,
@ -51,12 +52,12 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_KW_PASS, MP_TOKEN_KW_PASS,
MP_TOKEN_KW_RAISE, MP_TOKEN_KW_RAISE,
MP_TOKEN_KW_RETURN, MP_TOKEN_KW_RETURN,
MP_TOKEN_KW_TRY, // 41 MP_TOKEN_KW_TRY, // 42
MP_TOKEN_KW_WHILE, MP_TOKEN_KW_WHILE,
MP_TOKEN_KW_WITH, MP_TOKEN_KW_WITH,
MP_TOKEN_KW_YIELD, MP_TOKEN_KW_YIELD,
MP_TOKEN_OP_PLUS, // 45 MP_TOKEN_OP_PLUS, // 46
MP_TOKEN_OP_MINUS, MP_TOKEN_OP_MINUS,
MP_TOKEN_OP_STAR, MP_TOKEN_OP_STAR,
MP_TOKEN_OP_DBL_STAR, MP_TOKEN_OP_DBL_STAR,
@ -66,7 +67,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS,
MP_TOKEN_OP_DBL_LESS, MP_TOKEN_OP_DBL_LESS,
MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE,
MP_TOKEN_OP_DBL_MORE, // 55 MP_TOKEN_OP_DBL_MORE, // 56
MP_TOKEN_OP_AMPERSAND, MP_TOKEN_OP_AMPERSAND,
MP_TOKEN_OP_PIPE, MP_TOKEN_OP_PIPE,
MP_TOKEN_OP_CARET, MP_TOKEN_OP_CARET,
@ -76,7 +77,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_OP_DBL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
MP_TOKEN_OP_NOT_EQUAL, MP_TOKEN_OP_NOT_EQUAL,
MP_TOKEN_DEL_PAREN_OPEN, // 64 MP_TOKEN_DEL_PAREN_OPEN, // 65
MP_TOKEN_DEL_PAREN_CLOSE, MP_TOKEN_DEL_PAREN_CLOSE,
MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_OPEN,
MP_TOKEN_DEL_BRACKET_CLOSE, MP_TOKEN_DEL_BRACKET_CLOSE,
@ -86,7 +87,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_COLON,
MP_TOKEN_DEL_PERIOD, MP_TOKEN_DEL_PERIOD,
MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_SEMICOLON,
MP_TOKEN_DEL_AT, // 74 MP_TOKEN_DEL_AT, // 75
MP_TOKEN_DEL_EQUAL, MP_TOKEN_DEL_EQUAL,
MP_TOKEN_DEL_PLUS_EQUAL, MP_TOKEN_DEL_PLUS_EQUAL,
MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_EQUAL,
@ -96,7 +97,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_DEL_PERCENT_EQUAL, MP_TOKEN_DEL_PERCENT_EQUAL,
MP_TOKEN_DEL_AMPERSAND_EQUAL, MP_TOKEN_DEL_AMPERSAND_EQUAL,
MP_TOKEN_DEL_PIPE_EQUAL, MP_TOKEN_DEL_PIPE_EQUAL,
MP_TOKEN_DEL_CARET_EQUAL, // 84 MP_TOKEN_DEL_CARET_EQUAL, // 85
MP_TOKEN_DEL_DBL_MORE_EQUAL, MP_TOKEN_DEL_DBL_MORE_EQUAL,
MP_TOKEN_DEL_DBL_LESS_EQUAL, MP_TOKEN_DEL_DBL_LESS_EQUAL,
MP_TOKEN_DEL_DBL_STAR_EQUAL, MP_TOKEN_DEL_DBL_STAR_EQUAL,

View File

@ -88,6 +88,7 @@ typedef struct _parser_t {
uint rule_stack_top; uint rule_stack_top;
rule_stack_t *rule_stack; rule_stack_t *rule_stack;
uint result_stack_alloc;
uint result_stack_top; uint result_stack_top;
mp_parse_node_t *result_stack; mp_parse_node_t *result_stack;
} parser_t; } parser_t;
@ -121,7 +122,7 @@ mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg) {
int num_parse_nodes_allocated = 0; int num_parse_nodes_allocated = 0;
mp_parse_node_struct_t *parse_node_new_struct(int rule_id, int num_args) { mp_parse_node_struct_t *parse_node_new_struct(int rule_id, int num_args) {
mp_parse_node_struct_t *pn = m_malloc(sizeof(mp_parse_node_struct_t) + num_args * sizeof(mp_parse_node_t)); mp_parse_node_struct_t *pn = m_new_obj_var(mp_parse_node_struct_t, mp_parse_node_t, num_args);
pn->source = 0; // TODO pn->source = 0; // TODO
pn->kind_num_nodes = (rule_id & 0xff) | (num_args << 8); pn->kind_num_nodes = (rule_id & 0xff) | (num_args << 8);
num_parse_nodes_allocated += 1; num_parse_nodes_allocated += 1;
@ -180,6 +181,10 @@ static mp_parse_node_t peek_result(parser_t *parser, int pos) {
} }
static void push_result_node(parser_t *parser, mp_parse_node_t pn) { static void push_result_node(parser_t *parser, mp_parse_node_t pn) {
if (parser->result_stack_top >= parser->result_stack_alloc) {
parser->result_stack = m_renew(mp_parse_node_t, parser->result_stack, parser->result_stack_alloc, parser->result_stack_alloc * 2);
parser->result_stack_alloc *= 2;
}
parser->result_stack[parser->result_stack_top++] = pn; parser->result_stack[parser->result_stack_top++] = pn;
} }
@ -252,14 +257,20 @@ static void push_result_rule(parser_t *parser, const rule_t *rule, int num_args)
} }
mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) { mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
parser_t *parser = m_new(parser_t, 1);
// allocate memory for the parser and its stacks
parser_t *parser = m_new_obj(parser_t);
parser->rule_stack_alloc = 64; parser->rule_stack_alloc = 64;
parser->rule_stack_top = 0; parser->rule_stack_top = 0;
parser->rule_stack = m_new(rule_stack_t, parser->rule_stack_alloc); parser->rule_stack = m_new(rule_stack_t, parser->rule_stack_alloc);
parser->result_stack = m_new(mp_parse_node_t, 1000); parser->result_stack_alloc = 64;
parser->result_stack_top = 0; parser->result_stack_top = 0;
parser->result_stack = m_new(mp_parse_node_t, parser->result_stack_alloc);
// work out the top-level rule to use, and push it on the stack
int top_level_rule; int top_level_rule;
switch (input_kind) { switch (input_kind) {
case MP_PARSE_SINGLE_INPUT: top_level_rule = RULE_single_input; break; case MP_PARSE_SINGLE_INPUT: top_level_rule = RULE_single_input; break;
@ -268,6 +279,8 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
} }
push_rule(parser, rules[top_level_rule], 0); push_rule(parser, rules[top_level_rule], 0);
// parse!
uint n, i; uint n, i;
bool backtrack = false; bool backtrack = false;
const rule_t *rule; const rule_t *rule;
@ -558,12 +571,25 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
//printf("--------------\n"); //printf("--------------\n");
//result_stack_show(parser); //result_stack_show(parser);
assert(parser->result_stack_top == 1); //printf("rule stack alloc: %d\n", parser->rule_stack_alloc);
//printf("maximum depth: %d\n", parser->rule_stack_alloc); //printf("result stack alloc: %d\n", parser->result_stack_alloc);
//printf("number of parse nodes allocated: %d\n", num_parse_nodes_allocated); //printf("number of parse nodes allocated: %d\n", num_parse_nodes_allocated);
return parser->result_stack[0];
// get the root parse node that we created
assert(parser->result_stack_top == 1);
mp_parse_node_t result = parser->result_stack[0];
finished:
// free the memory that we don't need anymore
m_del(rule_stack_t, parser->rule_stack, parser->rule_stack_alloc);
m_del(mp_parse_node_t, parser->result_stack, parser->result_stack_alloc);
m_del_obj(parser_t, parser);
// return the result
return result;
syntax_error: syntax_error:
// TODO these should raise a proper exception
if (mp_lexer_is_kind(lex, MP_TOKEN_INDENT)) { if (mp_lexer_is_kind(lex, MP_TOKEN_INDENT)) {
mp_lexer_show_error_pythonic(lex, "IndentationError: unexpected indent"); mp_lexer_show_error_pythonic(lex, "IndentationError: unexpected indent");
} else if (mp_lexer_is_kind(lex, MP_TOKEN_DEDENT_MISMATCH)) { } else if (mp_lexer_is_kind(lex, MP_TOKEN_DEDENT_MISMATCH)) {
@ -575,5 +601,6 @@ syntax_error:
#endif #endif
mp_token_show(mp_lexer_cur(lex)); mp_token_show(mp_lexer_cur(lex));
} }
return MP_PARSE_NODE_NULL; result = MP_PARSE_NODE_NULL;
goto finished;
} }