From 1fb01bd6c5dc350f3c617ca8edae8dea9e5516ae Mon Sep 17 00:00:00 2001
From: Damien George
Date: Tue, 10 May 2022 13:56:24 +1000
Subject: [PATCH] py/emitnative: Put a pointer to the native prelude in child_table array.

Some architectures (like esp32 xtensa) cannot read byte-wise from
executable memory. This means the prelude for native functions -- which is
usually located after the machine code for the native function -- must be
placed in separate memory that can be read byte-wise. Prior to this commit
this was achieved by enabling N_PRELUDE_AS_BYTES_OBJ for the emitter and
MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ for the runtime. The prelude was
then placed in a bytes object, pointed to by the module's constant table.

This behaviour is changed by this commit so that a pointer to the prelude
is stored either in mp_obj_fun_bc_t.child_table, or in
mp_obj_fun_bc_t.child_table[num_children] if num_children > 0. The reasons
for doing this are:

1. It decouples the native emitter from runtime requirements, the emitted
   code no longer needs to know if the system it runs on can/can't read
   byte-wise from executable memory.

2. It makes all ports have the same emitter behaviour, there is no longer
   the N_PRELUDE_AS_BYTES_OBJ option.

3. The module's constant table is now used only for actual constants in the
   Python code. This allows further optimisations to be done with the
   constants (eg constant deduplication).

Code size change for those ports that enable the native emitter: unix x64: +80 +0.015% stm32: +24 +0.004% PYBV10 esp8266: +88 +0.013% GENERIC esp32: -20 -0.002% GENERIC[incl -112(data)] rp2: +32 +0.005% PICO Signed-off-by: Damien George --- py/compile.c | 8 ---- py/emitnative.c | 93 +++++++++++++++++---------------------------- py/emitnxtensawin.c | 1 - py/mpconfig.h | 6 ++- py/objgenerator.c | 15 ++++---- py/persistentcode.c | 24 +++++++++++- tools/mpy-tool.py | 29 ++++++++++++-- 7 files changed, 93 insertions(+), 83 deletions(-) diff --git a/py/compile.c b/py/compile.c index 6bb601b925..2760d8bfb9 100644 --- a/py/compile.c +++ b/py/compile.c @@ -3031,14 +3031,6 @@ STATIC bool compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) { // they will be computed in this first pass scope->stack_size = 0; scope->exc_stack_size = 0; - - #if MICROPY_EMIT_NATIVE - if (scope->emit_options == MP_EMIT_OPT_NATIVE_PYTHON || scope->emit_options == MP_EMIT_OPT_VIPER) { - // allow native code to perfom basic tasks during the pass scope - // note: the first argument passed here is mp_emit_common_t, not the native emitter context - NATIVE_EMITTER_TABLE->start_pass((void *)&comp->emit_common, comp->pass, scope); - } - #endif } // compile diff --git a/py/emitnative.c b/py/emitnative.c index bfb37ef4f7..ddc0b5b97b 100644 --- a/py/emitnative.c +++ b/py/emitnative.c @@ -231,9 +231,7 @@ struct _emit_t { exc_stack_entry_t *exc_stack; int prelude_offset; - #if N_PRELUDE_AS_BYTES_OBJ - size_t prelude_const_table_offset; - #endif + int prelude_ptr_index; int start_offset; int n_state; uint16_t code_state_start; @@ -349,16 +347,6 @@ STATIC void emit_native_mov_reg_qstr_obj(emit_t *emit, int reg_dest, qstr qst) { STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) { DEBUG_printf("start_pass(pass=%u, scope=%p)\n", pass, scope); - if (pass == MP_PASS_SCOPE) { - // Note: the first argument passed here is mp_emit_common_t, not the native emitter context - 
#if N_PRELUDE_AS_BYTES_OBJ - if (scope->emit_options == MP_EMIT_OPT_NATIVE_PYTHON) { - mp_emit_common_alloc_const_obj((mp_emit_common_t *)emit, mp_const_none); - } - #endif - return; - } - emit->pass = pass; emit->do_viper_types = scope->emit_options == MP_EMIT_OPT_VIPER; emit->stack_size = 0; @@ -511,12 +499,7 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop if (emit->scope->scope_flags & MP_SCOPE_FLAG_GENERATOR) { emit->code_state_start = 0; emit->stack_start = SIZEOF_CODE_STATE; - #if N_PRELUDE_AS_BYTES_OBJ - // Load index of prelude bytes object in const_table - mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_const_table_offset); - #else - mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_offset); - #endif + mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_ptr_index); mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->start_offset); ASM_ENTRY(emit->as, SIZEOF_NLR_BUF); @@ -562,41 +545,19 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop // Set code_state.fun_bc ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_FUN_OBJ(emit), REG_PARENT_ARG_1); - // Set code_state.ip, a pointer to the beginning of the prelude + // Set code_state.ip, a pointer to the beginning of the prelude. This pointer is found + // either directly in mp_obj_fun_bc_t.child_table (if there are no children), or in + // mp_obj_fun_bc_t.child_table[num_children] (if num_children > 0). // Need to use some locals for this, so assert that they are available for use MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_1); MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_2); MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_3); MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_4); - int code_state_ip_local = emit->code_state_start + OFFSETOF_CODE_STATE_IP; - #if N_PRELUDE_AS_BYTES_OBJ - // Prelude is a bytes object in const_table[prelude_const_table_offset]. 
- ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CONTEXT); - ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, OFFSETOF_MODULE_CONTEXT_OBJ_TABLE); - ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, emit->prelude_const_table_offset); - ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, offsetof(mp_obj_str_t, data) / sizeof(uintptr_t)); - #else - MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_1); - MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_2); - MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_3); - MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_4); - // Prelude is at the end of the machine code - ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_BYTECODE); - if (emit->pass == MP_PASS_CODE_SIZE) { - // Commit to the encoding size based on the value of prelude_offset in this pass. - // By using 32768 as the cut-off it is highly unlikely that prelude_offset will - // grow beyond 65535 by the end of thiss pass, and so require the larger encoding. 
- emit->prelude_offset_uses_u16_encoding = emit->prelude_offset < 32768; + ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CHILD_TABLE); + if (emit->prelude_ptr_index != 0) { + ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, emit->prelude_ptr_index); } - if (emit->prelude_offset_uses_u16_encoding) { - assert(emit->prelude_offset <= 65535); - ASM_MOV_REG_IMM_FIX_U16((emit)->as, REG_LOCAL_2, emit->prelude_offset); - } else { - ASM_MOV_REG_IMM_FIX_WORD((emit)->as, REG_LOCAL_2, emit->prelude_offset); - } - ASM_ADD_REG_REG(emit->as, REG_LOCAL_3, REG_LOCAL_2); - #endif - emit_native_mov_state_reg(emit, code_state_ip_local, REG_LOCAL_3); + emit_native_mov_state_reg(emit, emit->code_state_start + OFFSETOF_CODE_STATE_IP, REG_LOCAL_3); // Set code_state.n_state (only works on little endian targets due to n_state being uint16_t) emit_native_mov_state_imm_via(emit, emit->code_state_start + OFFSETOF_CODE_STATE_N_STATE, emit->n_state, REG_ARG_1); @@ -657,6 +618,7 @@ STATIC bool emit_native_end_pass(emit_t *emit) { if (!emit->do_viper_types) { emit->prelude_offset = mp_asm_base_get_code_pos(&emit->as->base); + emit->prelude_ptr_index = emit->emit_common->ct_cur_child; size_t n_state = emit->n_state; size_t n_exc_stack = 0; // exc-stack not needed for native code @@ -693,16 +655,6 @@ STATIC bool emit_native_end_pass(emit_t *emit) { } emit->n_cell = mp_asm_base_get_code_pos(&emit->as->base) - cell_start; - #if N_PRELUDE_AS_BYTES_OBJ - // Create the prelude as a bytes object, and store it in the constant table - mp_obj_t prelude = mp_const_none; - if (emit->pass == MP_PASS_EMIT) { - void *buf = emit->as->base.code_base + emit->prelude_offset; - size_t n = emit->as->base.code_offset - emit->prelude_offset; - prelude = mp_obj_new_bytes(buf, n); - } - emit->prelude_const_table_offset = mp_emit_common_alloc_const_obj(emit->emit_common, prelude); - #endif } ASM_END_PASS(emit->as); @@ -725,10 +677,33 @@ STATIC bool 
emit_native_end_pass(emit_t *emit) { void *f = mp_asm_base_get_code(&emit->as->base); mp_uint_t f_len = mp_asm_base_get_code_size(&emit->as->base); + mp_raw_code_t **children = emit->emit_common->children; + if (!emit->do_viper_types) { + #if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE + // Executable code cannot be accessed byte-wise on this architecture, so copy + // the prelude to a separate memory region that is byte-wise readable. + void *buf = emit->as->base.code_base + emit->prelude_offset; + size_t n = emit->as->base.code_offset - emit->prelude_offset; + const uint8_t *prelude_ptr = memcpy(m_new(uint8_t, n), buf, n); + #else + // Point to the prelude directly, at the end of the machine code data. + const uint8_t *prelude_ptr = (const uint8_t *)f + emit->prelude_offset; + #endif + + // Store the pointer to the prelude using the child_table. + assert(emit->prelude_ptr_index == emit->emit_common->ct_cur_child); + if (emit->prelude_ptr_index == 0) { + children = (void *)prelude_ptr; + } else { + children = m_renew(mp_raw_code_t *, children, emit->prelude_ptr_index, emit->prelude_ptr_index + 1); + children[emit->prelude_ptr_index] = (void *)prelude_ptr; + } + } + mp_emit_glue_assign_native(emit->scope->raw_code, emit->do_viper_types ? 
MP_CODE_NATIVE_VIPER : MP_CODE_NATIVE_PY, f, f_len, - emit->emit_common->children, + children, #if MICROPY_PERSISTENT_CODE_SAVE emit->emit_common->ct_cur_child, emit->prelude_offset, diff --git a/py/emitnxtensawin.c b/py/emitnxtensawin.c index 38d5db13ea..99aac08dc4 100644 --- a/py/emitnxtensawin.c +++ b/py/emitnxtensawin.c @@ -15,7 +15,6 @@ #define NLR_BUF_IDX_LOCAL_3 (2 + 6) // a6 #define N_NLR_SETJMP (1) -#define N_PRELUDE_AS_BYTES_OBJ (1) #define N_XTENSAWIN (1) #define EXPORT_FUN(name) emit_native_xtensawin_##name #include "py/emitnative.c" diff --git a/py/mpconfig.h b/py/mpconfig.h index f9894b497c..59ab84cd4c 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -394,8 +394,10 @@ // Convenience definition for whether any native emitter is enabled #define MICROPY_EMIT_NATIVE (MICROPY_EMIT_X64 || MICROPY_EMIT_X86 || MICROPY_EMIT_THUMB || MICROPY_EMIT_ARM || MICROPY_EMIT_XTENSA || MICROPY_EMIT_XTENSAWIN) -// Select prelude-as-bytes-object for certain emitters -#define MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ (MICROPY_EMIT_XTENSAWIN) +// Some architectures cannot read byte-wise from executable memory. In this case +// the prelude for a native function (which usually sits after the machine code) +// must be separated and placed somewhere where it can be read byte-wise. +#define MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE (MICROPY_EMIT_XTENSAWIN) // Convenience definition for whether any inline assembler emitter is enabled #define MICROPY_EMIT_INLINE_ASM (MICROPY_EMIT_INLINE_THUMB || MICROPY_EMIT_INLINE_XTENSA) diff --git a/py/objgenerator.c b/py/objgenerator.c index 12fd81a8bb..802fd45bbd 100644 --- a/py/objgenerator.c +++ b/py/objgenerator.c @@ -98,14 +98,13 @@ STATIC mp_obj_t native_gen_wrap_call(mp_obj_t self_in, size_t n_args, size_t n_k mp_obj_fun_bc_t *self_fun = MP_OBJ_TO_PTR(self_in); // Determine start of prelude. 
- uintptr_t prelude_offset = ((uintptr_t *)self_fun->bytecode)[0]; - #if MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ - // Prelude is in bytes object in const_table, at index prelude_offset - mp_obj_str_t *prelude_bytes = MP_OBJ_TO_PTR(self_fun->context->constants.obj_table[prelude_offset]); - const uint8_t *prelude_ptr = prelude_bytes->data; - #else - const uint8_t *prelude_ptr = self_fun->bytecode + prelude_offset; - #endif + uintptr_t prelude_ptr_index = ((uintptr_t *)self_fun->bytecode)[0]; + const uint8_t *prelude_ptr; + if (prelude_ptr_index == 0) { + prelude_ptr = (void *)self_fun->child_table; + } else { + prelude_ptr = (void *)self_fun->child_table[prelude_ptr_index]; + } // Extract n_state from the prelude. const uint8_t *ip = prelude_ptr; diff --git a/py/persistentcode.c b/py/persistentcode.c index 0b5da80774..e23c775d1f 100644 --- a/py/persistentcode.c +++ b/py/persistentcode.c @@ -348,7 +348,7 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader) { // Load children if any. if (has_children) { n_children = read_uint(reader); - children = m_new(mp_raw_code_t *, n_children); + children = m_new(mp_raw_code_t *, n_children + (kind == MP_CODE_NATIVE_PY)); for (size_t i = 0; i < n_children; ++i) { children[i] = load_raw_code(reader); } @@ -372,6 +372,17 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader) { #if MICROPY_EMIT_MACHINE_CODE } else { + const uint8_t *prelude_ptr; + #if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE + if (kind == MP_CODE_NATIVE_PY) { + // Executable code cannot be accessed byte-wise on this architecture, so copy + // the prelude to a separate memory region that is byte-wise readable. 
+ void *buf = fun_data + prelude_offset; + size_t n = fun_data_len - prelude_offset; + prelude_ptr = memcpy(m_new(uint8_t, n), buf, n); + } + #endif + // Relocate and commit code to executable address space reloc_info_t ri = {reader, rodata, bss}; #if defined(MP_PLAT_COMMIT_EXEC) @@ -395,6 +406,17 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader) { } #endif + if (kind == MP_CODE_NATIVE_PY) { + #if !MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE + prelude_ptr = fun_data + prelude_offset; + #endif + if (n_children == 0) { + children = (void *)prelude_ptr; + } else { + children[n_children] = (void *)prelude_ptr; + } + } + // Assign native code to raw code object mp_emit_glue_assign_native(rc, kind, fun_data, fun_data_len, diff --git a/tools/mpy-tool.py b/tools/mpy-tool.py index 3ebbdd1104..2974e357b0 100755 --- a/tools/mpy-tool.py +++ b/tools/mpy-tool.py @@ -824,7 +824,7 @@ class RawCode(object): for rc in self.children: rc.disassemble() - def freeze_children(self): + def freeze_children(self, prelude_ptr=None): # Freeze children and generate table of children. if len(self.children): for rc in self.children: @@ -834,10 +834,12 @@ class RawCode(object): print("static const mp_raw_code_t *const children_%s[] = {" % self.escaped_name) for rc in self.children: print(" &raw_code_%s," % rc.escaped_name) + if prelude_ptr: + print(" (void *)%s," % prelude_ptr) print("};") print() - def freeze_raw_code(self, qstr_links=(), type_sig=0): + def freeze_raw_code(self, prelude_ptr=None, qstr_links=(), type_sig=0): # Generate mp_raw_code_t. 
print("static const mp_raw_code_t raw_code_%s = {" % self.escaped_name) print(" .kind = %s," % RawCode.code_kind_str[self.code_kind]) @@ -849,6 +851,8 @@ class RawCode(object): print(" #endif") if len(self.children): print(" .children = (void *)&children_%s," % self.escaped_name) + elif prelude_ptr: + print(" .children = (void *)%s," % prelude_ptr) else: print(" .children = NULL,") print(" #if MICROPY_PERSISTENT_CODE_SAVE") @@ -1112,8 +1116,25 @@ class RawCodeNative(RawCode): print("};") - self.freeze_children() - self.freeze_raw_code(self.qstr_links, self.type_sig) + prelude_ptr = None + if self.code_kind == MP_CODE_NATIVE_PY: + prelude_ptr = "fun_data_%s_prelude_macro" % self.escaped_name + print("#if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE") + n = len(self.fun_data) - self.prelude_offset + print("static const byte fun_data_%s_prelude[%u] = {" % (self.escaped_name, n), end="") + for i in range(n): + print(" 0x%02x," % self.fun_data[self.prelude_offset + i], end="") + print("};") + print("#define %s &fun_data_%s_prelude[0]" % (prelude_ptr, self.escaped_name)) + print("#else") + print( + "#define %s &fun_data_%s[%u]" + % (prelude_ptr, self.escaped_name, self.prelude_offset) + ) + print("#endif") + + self.freeze_children(prelude_ptr) + self.freeze_raw_code(prelude_ptr, self.qstr_links, self.type_sig) class MPYSegment: