From 4c81ba8015238a343593468aa5173440fd392e32 Mon Sep 17 00:00:00 2001 From: Damien George Date: Tue, 13 Jan 2015 16:21:23 +0000 Subject: [PATCH] py: Never intern data of large string/bytes object; add relevant tests. Previously to this patch all constant string/bytes objects were interned by the compiler, and this lead to crashes when the qstr was too long (noticeable now that qstr length storage defaults to 1 byte). With this patch, long string/bytes objects are never interned, and are referenced directly as constant objects within generated code using load_const_obj. --- py/compile.c | 32 +++++++++++++++++++++----------- py/parse.c | 21 +++++++++++---------- py/qstr.c | 1 + tests/basics/bytes_large.py | 2 ++ tests/basics/string_large.py | 2 ++ 5 files changed, 37 insertions(+), 21 deletions(-) create mode 100644 tests/basics/bytes_large.py create mode 100644 tests/basics/string_large.py diff --git a/py/compile.c b/py/compile.c index 3ee78bbfc7..fe0e4cb5d2 100644 --- a/py/compile.c +++ b/py/compile.c @@ -46,6 +46,7 @@ typedef enum { #undef DEF_RULE PN_maximum_number_of, PN_string, // special node for non-interned string + PN_bytes, // special node for non-interned bytes } pn_kind_t; #define EMIT(fun) (comp->emit_method_table->fun(comp->emit)) @@ -172,6 +173,7 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m break; #endif case PN_string: + case PN_bytes: return pn; } @@ -427,6 +429,9 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) { if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) { return true; } + if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) { + return true; + } if (!MP_PARSE_NODE_IS_LEAF(pn)) { return false; } @@ -475,9 +480,9 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len, } STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) { - if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) { + if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string) || MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) { mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn; - cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false); + cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)); return; } @@ -2151,7 +2156,8 @@ STATIC void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) { } else { // for non-REPL, evaluate then discard the expression if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0])) - || MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) { + || MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string) + || MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_bytes)) { // do nothing with a lonely constant } else { compile_node(comp, pns->nodes[0]); // just an expression @@ -2595,8 +2601,12 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) { } else { assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i])); mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i]; - assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string); - pn_kind = MP_PARSE_NODE_STRING; + if (MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string) { + pn_kind = MP_PARSE_NODE_STRING; + } else { + assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_bytes); + pn_kind = MP_PARSE_NODE_BYTES; + } n_bytes += (mp_uint_t)pns_string->nodes[1]; } if (i == 0) { @@ -2608,8 +2618,8 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) { } // concatenate string/bytes - byte *q_ptr; - byte *s_dest = qstr_build_start(n_bytes, &q_ptr); + byte *s_dest; + mp_obj_t obj = mp_obj_str_builder_start(string_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, n_bytes, &s_dest); for (int i = 0; i < n; i++) { if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) { mp_uint_t s_len; @@ -2622,9 +2632,7 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) { s_dest += (mp_uint_t)pns_string->nodes[1]; } } - qstr q = qstr_build_end(q_ptr); - - EMIT_ARG(load_const_str, q, string_kind == MP_PARSE_NODE_BYTES); + EMIT_ARG(load_const_obj, mp_obj_str_builder_end(obj)); } // pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node @@ -2959,7 +2967,9 @@ STATIC void compile_node(compiler_t *comp, mp_parse_node_t pn) { mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn; EMIT_ARG(set_line_number, pns->source_line); if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) { - EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]), false); + EMIT_ARG(load_const_obj, mp_obj_new_str((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false)); + } else if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_bytes) { + EMIT_ARG(load_const_obj, mp_obj_new_bytes((const byte*)pns->nodes[0], (mp_uint_t)pns->nodes[1])); } else { compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)]; if (f == NULL) { diff --git a/py/parse.c b/py/parse.c index c7c85e0357..6d2108dcaa 100644 --- a/py/parse.c +++ b/py/parse.c @@ -70,6 +70,7 @@ enum { #undef DEF_RULE RULE_maximum_number_of, RULE_string, // special node for non-interned string + RULE_bytes, // special node for non-interned bytes }; #define ident (RULE_ACT_ALLOW_IDENT) @@ -176,7 +177,7 @@ void mp_parse_node_free(mp_parse_node_t pn) { mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn; mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns); mp_uint_t rule_id = MP_PARSE_NODE_STRUCT_KIND(pns); - if (rule_id == RULE_string) { + if (rule_id == RULE_string || rule_id == RULE_bytes) { m_del(char, (char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]); } else { bool adjust = ADD_BLANK_NODE(rules[rule_id]); @@ -225,6 +226,8 @@ void mp_parse_node_print(mp_parse_node_t pn, mp_uint_t indent) { mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn; if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) { printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]); + } else if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_bytes) { + printf("literal bytes(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]); } else { mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns); #ifdef USE_RULE_NAME @@ -281,14 +284,14 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) { parser->result_stack[parser->result_stack_top++] = pn; } -STATIC void push_result_string(parser_t *parser, mp_uint_t src_line, const char *str, mp_uint_t len) { +STATIC void push_result_string_bytes(parser_t *parser, mp_uint_t src_line, mp_uint_t rule_kind, const char *str, mp_uint_t len) { mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2); if (pn == NULL) { memory_error(parser); return; } pn->source_line = src_line; - pn->kind_num_nodes = RULE_string | (2 << 8); + pn->kind_num_nodes = rule_kind | (2 << 8); char *p = m_new(char, len); memcpy(p, str, len); pn->nodes[0] = (mp_int_t)p; @@ -340,8 +343,8 @@ STATIC void push_result_token(parser_t *parser) { } else { pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len)); } - } else if (lex->tok_kind == MP_TOKEN_STRING) { - // Don't automatically intern all strings. doc strings (which are usually large) + } else if (lex->tok_kind == MP_TOKEN_STRING || lex->tok_kind == MP_TOKEN_BYTES) { + // Don't automatically intern all strings/bytes. doc strings (which are usually large) // will be discarded by the compiler, and so we shouldn't intern them. qstr qst = MP_QSTR_NULL; if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) { @@ -353,14 +356,12 @@ STATIC void push_result_token(parser_t *parser) { } if (qst != MP_QSTR_NULL) { // qstr exists, make a leaf node - pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst); + pn = mp_parse_node_new_leaf(lex->tok_kind == MP_TOKEN_STRING ? MP_PARSE_NODE_STRING : MP_PARSE_NODE_BYTES, qst); } else { - // not interned, make a node holding a pointer to the string data - push_result_string(parser, lex->tok_line, lex->vstr.buf, lex->vstr.len); + // not interned, make a node holding a pointer to the string/bytes data + push_result_string_bytes(parser, lex->tok_line, lex->tok_kind == MP_TOKEN_STRING ? RULE_string : RULE_bytes, lex->vstr.buf, lex->vstr.len); return; } - } else if (lex->tok_kind == MP_TOKEN_BYTES) { - pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(lex->vstr.buf, lex->vstr.len)); } else { pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind); } diff --git a/py/qstr.c b/py/qstr.c index c244738a5e..2553872b16 100644 --- a/py/qstr.c +++ b/py/qstr.c @@ -148,6 +148,7 @@ qstr qstr_from_str(const char *str) { } qstr qstr_from_strn(const char *str, mp_uint_t len) { + assert(len < (1 << (8 * MICROPY_QSTR_BYTES_IN_LEN))); qstr q = qstr_find_strn(str, len); if (q == 0) { mp_uint_t hash = qstr_compute_hash((const byte*)str, len); diff --git a/tests/basics/bytes_large.py b/tests/basics/bytes_large.py new file mode 100644 index 0000000000..9b6b79bbb8 --- /dev/null +++ b/tests/basics/bytes_large.py @@ -0,0 +1,2 @@ +b1 = b"long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes" +b2 = b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" diff --git a/tests/basics/string_large.py b/tests/basics/string_large.py new file mode 100644 index 0000000000..90a4af250b --- /dev/null +++ b/tests/basics/string_large.py @@ -0,0 +1,2 @@ +s1 = "long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string" +s2 = "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string"