py: Improve encoding scheme for line-number to bytecode map.

Reduces the amount of RAM needed to store the line-number to bytecode map in the bytecode prelude by about a factor of 10 on average. Using CPython 3.4's stdlib for statistics: previously, an average of 13 bytes was used per (bytecode offset, line-number offset) pair; with this improvement, that is down to 1.3 bytes on average. The large RAM usage before was due to some very large steps in line numbers, both from the jump to the first line of a function that sits far down in the file, and from functions that have big comments and/or big strings in them (both cases were significant). Although the savings are large on average for the CPython stdlib, the change won't have such a big effect on the small scripts used in embedded programming. Addresses issue #648.
2014-07-31 16:12:01 +00:00 · 2014-07-31 16:12:01 +00:00 · 4747becc64
commit 4747becc64
--- a/py/emitbc.c
+++ b/py/emitbc.c
@ -115,12 +115,24 @@ STATIC void emit_write_code_info_qstr(emit_t* emit, qstr qstr) {
 #if MICROPY_ENABLE_SOURCE_LINE
 STATIC void emit_write_code_info_bytes_lines(emit_t* emit, uint bytes_to_skip, uint lines_to_skip) {
    assert(bytes_to_skip > 0 || lines_to_skip > 0);
+    //printf("  %d %d\n", bytes_to_skip, lines_to_skip);
    while (bytes_to_skip > 0 || lines_to_skip > 0) {
-        uint b = MIN(bytes_to_skip, 31);
-        uint l = MIN(lines_to_skip, 7);
+        mp_uint_t b, l;
+        if (lines_to_skip <= 6) {
+            // use 0b0LLBBBBB encoding
+            b = MIN(bytes_to_skip, 0x1f);
+            l = MIN(lines_to_skip, 0x3);
+            *emit_get_cur_to_write_code_info(emit, 1) = b | (l << 5);
+        } else {
+            // use 0b1LLLBBBB 0bLLLLLLLL encoding (l's LSB in second byte)
+            b = MIN(bytes_to_skip, 0xf);
+            l = MIN(lines_to_skip, 0x7ff);
+            byte *ci = emit_get_cur_to_write_code_info(emit, 2);
+            ci[0] = 0x80 | b | ((l >> 4) & 0x70);
+            ci[1] = l;
+        }
        bytes_to_skip -= b;
        lines_to_skip -= l;
-        *emit_get_cur_to_write_code_info(emit, 1) = b | (l << 5);
    }
 }
 #endif
@ -363,7 +375,6 @@ STATIC void emit_bc_set_source_line(emit_t *emit, int source_line) {
        uint bytes_to_skip = emit->bytecode_offset - emit->last_source_line_offset;
        uint lines_to_skip = source_line - emit->last_source_line;
        emit_write_code_info_bytes_lines(emit, bytes_to_skip, lines_to_skip);
-        //printf("  %d %d\n", bytes_to_skip, lines_to_skip);
        emit->last_source_line_offset = emit->bytecode_offset;
        emit->last_source_line = source_line;
    }
--- a/py/showbc.c
+++ b/py/showbc.c
@ -95,9 +95,18 @@ void mp_bytecode_print(const void *descr, const byte *ip, int len) {
        mp_int_t bc = (code_info + code_info_size) - ip;
        mp_uint_t source_line = 1;
        printf("  bc=" INT_FMT " line=" UINT_FMT "\n", bc, source_line);
-        for (const byte* ci = code_info + 12; *ci; ci++) {
-            bc += *ci & 31;
-            source_line += *ci >> 5;
+        for (const byte* ci = code_info + 12; *ci;) {
+            if ((ci[0] & 0x80) == 0) {
+                // 0b0LLBBBBB encoding
+                bc += ci[0] & 0x1f;
+                source_line += ci[0] >> 5;
+                ci += 1;
+            } else {
+                // 0b1LLLBBBB 0bLLLLLLLL encoding (l's LSB in second byte)
+                bc += ci[0] & 0xf;
+                source_line += ((ci[0] << 4) & 0x700) | ci[1];
+                ci += 2;
+            }
            printf("  bc=" INT_FMT " line=" UINT_FMT "\n", bc, source_line);
        }
    }
--- a/py/vm.c
+++ b/py/vm.c
@ -931,9 +931,27 @@ exception_handler:
                const byte* ci = code_info + 12;
                if (*ci) {
                    source_line = 1;
-                    for (; *ci && bc >= ((*ci) & 31); ci++) {
-                        bc -= *ci & 31;
-                        source_line += *ci >> 5;
+                    mp_uint_t c;
+                    while ((c = *ci)) {
+                        mp_uint_t b, l;
+                        if ((c & 0x80) == 0) {
+                            // 0b0LLBBBBB encoding
+                            b = c & 0x1f;
+                            l = c >> 5;
+                            ci += 1;
+                        } else {
+                            // 0b1LLLBBBB 0bLLLLLLLL encoding (l's LSB in second byte)
+                            b = c & 0xf;
+                            l = ((c << 4) & 0x700) | ci[1];
+                            ci += 2;
+                        }
+                        if (bc >= b) {
+                            bc -= b;
+                            source_line += l;
+                        } else {
+                            // found source line corresponding to bytecode offset
+                            break;
+                        }
                    }
                }
                mp_obj_exception_add_traceback(nlr.ret_val, source_file, source_line, block_name);