diff --git a/mpy-cross/mpconfigport.h b/mpy-cross/mpconfigport.h index 96d3726562..e7c8edf135 100644 --- a/mpy-cross/mpconfigport.h +++ b/mpy-cross/mpconfigport.h @@ -76,6 +76,7 @@ #define MICROPY_CPYTHON_COMPAT (1) #define MICROPY_USE_INTERNAL_PRINTF (0) +#define MICROPY_PY_FSTRINGS (1) #define MICROPY_PY_BUILTINS_STR_UNICODE (1) #if !(defined(MICROPY_GCREGS_SETJMP) || defined(__x86_64__) || defined(__i386__) || defined(__thumb2__) || defined(__thumb__) || defined(__arm__)) diff --git a/ports/unix/mpconfigport.h b/ports/unix/mpconfigport.h index d838f42b3f..de5a65ab73 100644 --- a/ports/unix/mpconfigport.h +++ b/ports/unix/mpconfigport.h @@ -88,6 +88,7 @@ #define MICROPY_PY_FUNCTION_ATTRS (1) #define MICROPY_PY_DESCRIPTORS (1) #define MICROPY_PY_DELATTR_SETATTR (1) +#define MICROPY_PY_FSTRINGS (1) #define MICROPY_PY_BUILTINS_STR_UNICODE (1) #define MICROPY_PY_BUILTINS_STR_CENTER (1) #define MICROPY_PY_BUILTINS_STR_PARTITION (1) diff --git a/ports/windows/mpconfigport.h b/ports/windows/mpconfigport.h index 6421c93bdb..ffcb5b105f 100644 --- a/ports/windows/mpconfigport.h +++ b/ports/windows/mpconfigport.h @@ -66,6 +66,7 @@ #define MICROPY_PY_FUNCTION_ATTRS (1) #define MICROPY_PY_DESCRIPTORS (1) #define MICROPY_PY_DELATTR_SETATTR (1) +#define MICROPY_PY_FSTRINGS (1) #define MICROPY_PY_BUILTINS_STR_UNICODE (1) #define MICROPY_PY_BUILTINS_STR_CENTER (1) #define MICROPY_PY_BUILTINS_STR_PARTITION (1) diff --git a/py/lexer.c b/py/lexer.c index 07ea2b96ab..ba118c9d2f 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -62,6 +62,12 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) { return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3; } +#if MICROPY_PY_FSTRINGS +STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) { + return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4; +} +#endif + STATIC bool is_char_following(mp_lexer_t *lex, byte c) { return lex->chr1 == c; } @@ -105,7 +111,13 @@ STATIC bool 
is_following_odigit(mp_lexer_t *lex) { STATIC bool is_string_or_bytes(mp_lexer_t *lex) { return is_char_or(lex, '\'', '\"') + #if MICROPY_PY_FSTRINGS + || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"')) + || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r')) + && is_char_following_following_or(lex, '\'', '\"'))) + #else || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) + #endif || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"')); } @@ -132,9 +144,35 @@ STATIC void next_char(mp_lexer_t *lex) { ++lex->column; } + // shift the input queue forward lex->chr0 = lex->chr1; lex->chr1 = lex->chr2; - lex->chr2 = lex->reader.readbyte(lex->reader.data); + + // and add the next byte from either the fstring args or the reader + #if MICROPY_PY_FSTRINGS + if (lex->fstring_args_idx) { + // if there are saved chars, then we're currently injecting fstring args + if (lex->fstring_args_idx < lex->fstring_args.len) { + lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++]; + } else { + // no more fstring arg bytes + lex->chr2 = '\0'; + } + + if (lex->chr0 == '\0') { + // consumed all fstring data, restore saved input queue + lex->chr0 = lex->chr0_saved; + lex->chr1 = lex->chr1_saved; + lex->chr2 = lex->chr2_saved; + // stop consuming fstring arg data + vstr_reset(&lex->fstring_args); + lex->fstring_args_idx = 0; + } + } else + #endif + { + lex->chr2 = lex->reader.readbyte(lex->reader.data); + } if (lex->chr1 == '\r') { // CR is a new line, converted to LF @@ -272,7 +310,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) { return true; } -STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { +STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) { // get first quoting character char quote_char = '\''; if (is_char(lex, '\"')) { @@ -293,12 +331,57 @@ STATIC void 
parse_string_literal(mp_lexer_t *lex, bool is_raw) { } size_t n_closing = 0; + #if MICROPY_PY_FSTRINGS + if (is_fstring) { + // assume there's going to be interpolation, so prep the injection data + // fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args. + // only when fstring_args_idx>0 will we consume the arg data + // note: lex->fstring_args will be empty already (it's reset when finished) + vstr_add_str(&lex->fstring_args, ".format("); + } + #endif + while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) { if (is_char(lex, quote_char)) { n_closing += 1; vstr_add_char(&lex->vstr, CUR_CHAR(lex)); } else { n_closing = 0; + + #if MICROPY_PY_FSTRINGS + while (is_fstring && is_char(lex, '{')) { + next_char(lex); + if (is_char(lex, '{')) { + // "{{" is passed through unchanged to be handled by str.format + vstr_add_byte(&lex->vstr, '{'); + next_char(lex); + } else { + // remember the start of this argument (if we need it for f'{a=}'). + size_t i = lex->fstring_args.len; + // extract characters inside the { until we reach the + // format specifier or closing }. + // (MicroPython limitation) note: this is completely unaware of + // Python syntax and will not handle any expression containing '}' or ':'. + // e.g. f'{"}"}' or f'{foo({})}'. + while (!is_end(lex) && !is_char_or(lex, ':', '}')) { + // like the default case at the end of this function, stay 8-bit clean + vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex)); + next_char(lex); + } + if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') { + // if the last character of the arg was '=', then inject "arg=" before the '{'. 
+ // f'{a=}' --> 'a={}'.format(a) + vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i); + // remove the trailing '=' + lex->fstring_args.len--; + } + // comma-separate args + vstr_add_byte(&lex->fstring_args, ','); + } + vstr_add_byte(&lex->vstr, '{'); + } + #endif + if (is_char(lex, '\\')) { next_char(lex); unichar c = CUR_CHAR(lex); @@ -451,6 +534,23 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { } void mp_lexer_to_next(mp_lexer_t *lex) { + #if MICROPY_PY_FSTRINGS + if (lex->fstring_args.len && lex->fstring_args_idx == 0) { + // moving onto the next token means the literal string is complete. + // switch into injecting the format args. + vstr_add_byte(&lex->fstring_args, ')'); + lex->chr0_saved = lex->chr0; + lex->chr1_saved = lex->chr1; + lex->chr2_saved = lex->chr2; + lex->chr0 = lex->fstring_args.buf[0]; + lex->chr1 = lex->fstring_args.buf[1]; + lex->chr2 = lex->fstring_args.buf[2]; + // we've already extracted 3 chars, but setting this non-zero also + // means we'll start consuming the fstring data + lex->fstring_args_idx = 3; + } + #endif + // start new token text vstr_reset(&lex->vstr); @@ -506,6 +606,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) { do { // parse type codes bool is_raw = false; + bool is_fstring = false; mp_token_kind_t kind = MP_TOKEN_STRING; int n_char = 0; if (is_char(lex, 'u')) { @@ -524,7 +625,25 @@ void mp_lexer_to_next(mp_lexer_t *lex) { kind = MP_TOKEN_BYTES; n_char = 2; } + #if MICROPY_PY_FSTRINGS + if (is_char_following(lex, 'f')) { + // raw-f-strings unsupported, immediately return (invalid) token. + lex->tok_kind = MP_TOKEN_FSTRING_RAW; + break; + } + #endif } + #if MICROPY_PY_FSTRINGS + else if (is_char(lex, 'f')) { + if (is_char_following(lex, 'r')) { + // raw-f-strings unsupported, immediately return (invalid) token. 
+ lex->tok_kind = MP_TOKEN_FSTRING_RAW; + break; + } + n_char = 1; + is_fstring = true; + } + #endif // Set or check token kind if (lex->tok_kind == MP_TOKEN_END) { @@ -543,7 +662,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) { } // Parse the literal - parse_string_literal(lex, is_raw); + parse_string_literal(lex, is_raw, is_fstring); // Skip whitespace so we can check if there's another string following skip_whitespace(lex, true); @@ -703,6 +822,9 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) { lex->num_indent_level = 1; lex->indent_level = m_new(uint16_t, lex->alloc_indent_level); vstr_init(&lex->vstr, 32); + #if MICROPY_PY_FSTRINGS + vstr_init(&lex->fstring_args, 0); + #endif // store sentinel for first indentation level lex->indent_level[0] = 0; diff --git a/py/lexer.h b/py/lexer.h index 91767a44bf..e16b9a8ce8 100644 --- a/py/lexer.h +++ b/py/lexer.h @@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t { MP_TOKEN_INVALID, MP_TOKEN_DEDENT_MISMATCH, MP_TOKEN_LONELY_STRING_OPEN, + #if MICROPY_PY_FSTRINGS + MP_TOKEN_MALFORMED_FSTRING, + MP_TOKEN_FSTRING_RAW, + #endif MP_TOKEN_NEWLINE, MP_TOKEN_INDENT, @@ -158,6 +162,9 @@ typedef struct _mp_lexer_t { mp_reader_t reader; // stream source unichar chr0, chr1, chr2; // current cached characters from source + #if MICROPY_PY_FSTRINGS + unichar chr0_saved, chr1_saved, chr2_saved; // chr0..chr2 from the reader, saved while fstring_args is being injected + #endif size_t line; // current source line size_t column; // current source column @@ -173,6 +180,10 @@ typedef struct _mp_lexer_t { size_t tok_column; // token source column mp_token_kind_t tok_kind; // token kind vstr_t vstr; // token data + #if MICROPY_PY_FSTRINGS + vstr_t fstring_args; // extracted arguments to pass to .format() + size_t fstring_args_idx; // how many bytes of fstring_args have been read + #endif } mp_lexer_t; mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader); diff --git a/py/mpconfig.h b/py/mpconfig.h index a91c39b018..a5d639efe8 100644 --- a/py/mpconfig.h +++ 
b/py/mpconfig.h @@ -875,6 +875,11 @@ typedef double mp_float_t; #define MICROPY_PY_ASYNC_AWAIT (1) #endif +// Support for literal string interpolation, f-strings (see PEP 498, Python 3.6+) +#ifndef MICROPY_PY_FSTRINGS +#define MICROPY_PY_FSTRINGS (0) +#endif + // Support for assignment expressions with := (see PEP 572, Python 3.8+) #ifndef MICROPY_PY_ASSIGN_EXPR #define MICROPY_PY_ASSIGN_EXPR (1) diff --git a/py/parse.c b/py/parse.c index da2f5e796d..ae3fa8ea6d 100644 --- a/py/parse.c +++ b/py/parse.c @@ -1152,6 +1152,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) { } else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) { exc = mp_obj_new_exception_msg(&mp_type_IndentationError, MP_ERROR_TEXT("unindent doesn't match any outer indent level")); + #if MICROPY_PY_FSTRINGS + } else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) { + exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, + MP_ERROR_TEXT("malformed f-string")); + } else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) { + exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, + MP_ERROR_TEXT("raw f-strings are not supported")); + #endif } else { exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, MP_ERROR_TEXT("invalid syntax")); diff --git a/tests/basics/string_fstring.py b/tests/basics/string_fstring.py new file mode 100644 index 0000000000..efb7e5a8e1 --- /dev/null +++ b/tests/basics/string_fstring.py @@ -0,0 +1,57 @@ +def f(): + return 4 +def g(_): + return 5 +def h(): + return 6 + +print(f'no interpolation') +print(f"no interpolation") +print(f"""no interpolation""") + +x, y = 1, 2 +print(f'{x}') +print(f'{x:08x}') +print(f'{x=}') +print(f'{x=:08x}') +print(f'a {x} b {y} c') +print(f'a {x:08x} b {y} c') +print(f'a {x=} b {y} c') +print(f'a {x=:08x} b {y} c') + +print(f'a {"hello"} b') +print(f'a {f() + g("foo") + h()} b') +print(f'a {f() + g("foo") + h()=} b') +print(f'a {f() + g("foo") + h()=:08x} b') + +def foo(a, b): + return f'{x}{y}{a}{b}' +print(foo(7, 8)) + +# 
PEP-0498 specifies that '\\' and '#' must be disallowed explicitly, whereas +# MicroPython relies on the syntax error as a result of the substitution. + +print(f"\\") +print(f'#') +try: + eval("f'{\}'") +except SyntaxError: + print('SyntaxError') +try: + eval("f'{#}'") +except SyntaxError: + print('SyntaxError') + + +# PEP-0498 specifies that handling of double braces '{{' or '}}' should +# behave like str.format. +print(f'{{}}') +print(f'{{{4*10}}}', '{40}') + +# A single closing brace, unlike str.format should raise a syntax error. +# MicroPython instead raises ValueError at runtime from the substitution. +try: + eval("f'{{}'") +except (ValueError, SyntaxError): + # MicroPython incorrectly raises ValueError here. + print('SyntaxError') diff --git a/tests/cmdline/cmd_parsetree.py b/tests/cmdline/cmd_parsetree.py index 50da369543..483ea89373 100644 --- a/tests/cmdline/cmd_parsetree.py +++ b/tests/cmdline/cmd_parsetree.py @@ -10,3 +10,4 @@ d = b"bytes" e = b"a very long bytes that will not be interned" f = 123456789012345678901234567890 g = 123 +h = f"fstring: '{b}'" diff --git a/tests/cmdline/cmd_parsetree.py.exp b/tests/cmdline/cmd_parsetree.py.exp index e64f4f7829..cc8ba82c05 100644 --- a/tests/cmdline/cmd_parsetree.py.exp +++ b/tests/cmdline/cmd_parsetree.py.exp @@ -1,6 +1,6 @@ ---------------- -[ 4] \(rule\|file_input_2\)(1) (n=9) - tok(4) +[ 4] \(rule\|file_input_2\)(1) (n=10) + tok(6) [ 4] \(rule\|for_stmt\)(22) (n=4) id(i) [ 4] \(rule\|atom_paren\)(45) (n=1) @@ -9,7 +9,7 @@ NULL [ 6] \(rule\|expr_stmt\)(5) (n=2) id(a) - tok(14) + tok(16) [ 7] \(rule\|expr_stmt\)(5) (n=2) id(b) str(str) @@ -28,6 +28,16 @@ [ 12] \(rule\|expr_stmt\)(5) (n=2) id(g) int(123) +[ 13] \(rule\|expr_stmt\)(5) (n=2) + id(h) +[ 13] \(rule\|atom_expr_normal\)(44) (n=2) +[ 13] literal const(\.\+) +[ 13] \(rule\|atom_expr_trailers\)(142) (n=2) +[ 13] \(rule\|trailer_period\)(50) (n=1) + id(format) +[ 13] \(rule\|trailer_paren\)(48) (n=1) +[ 13] \(rule\|arglist\)(164) (n=1) + id(b) 
---------------- File cmdline/cmd_parsetree.py, code block '' (descriptor: \.\+, bytecode @\.\+ bytes) Raw bytecode (code_info_size=\\d\+, bytecode_size=\\d\+): @@ -46,6 +56,7 @@ arg names: bc=32 line=10 bc=37 line=11 bc=42 line=12 + bc=48 line=13 00 BUILD_TUPLE 0 02 GET_ITER_STACK 03 FOR_ITER 12 @@ -65,8 +76,13 @@ arg names: 39 STORE_NAME f 42 LOAD_CONST_SMALL_INT 123 45 STORE_NAME g -48 LOAD_CONST_NONE -49 RETURN_VALUE +48 LOAD_CONST_OBJ \.\+ +50 LOAD_METHOD format +53 LOAD_NAME b (cache=0) +57 CALL_METHOD n=1 nkw=0 +59 STORE_NAME h +62 LOAD_CONST_NONE +63 RETURN_VALUE mem: total=\\d\+, current=\\d\+, peak=\\d\+ stack: \\d\+ out of \\d\+ GC: total: \\d\+, used: \\d\+, free: \\d\+ diff --git a/tests/cpydiff/core_fstring_concat.py b/tests/cpydiff/core_fstring_concat.py new file mode 100644 index 0000000000..fd83527b5c --- /dev/null +++ b/tests/cpydiff/core_fstring_concat.py @@ -0,0 +1,12 @@ +""" +categories: Core +description: f-strings don't support concatenation with adjacent literals if the adjacent literals contain braces +cause: MicroPython is optimised for code space. +workaround: Use the + operator between literal strings when either is an f-string +""" + +x = 1 +print("aa" f"{x}") +print(f"{x}" "ab") +print("a{}a" f"{x}") +print(f"{x}" "a{}b") diff --git a/tests/cpydiff/core_fstring_parser.py b/tests/cpydiff/core_fstring_parser.py new file mode 100644 index 0000000000..6917f3cfa4 --- /dev/null +++ b/tests/cpydiff/core_fstring_parser.py @@ -0,0 +1,9 @@ +""" +categories: Core +description: f-strings cannot support expressions that require parsing to resolve nested braces +cause: MicroPython is optimised for code space. 
+workaround: Only use simple expressions inside f-strings +""" + +f'{"hello {} world"}' +f"{repr({})}" diff --git a/tests/cpydiff/core_fstring_raw.py b/tests/cpydiff/core_fstring_raw.py new file mode 100644 index 0000000000..84e265f70f --- /dev/null +++ b/tests/cpydiff/core_fstring_raw.py @@ -0,0 +1,8 @@ +""" +categories: Core +description: Raw f-strings are not supported +cause: MicroPython is optimised for code space. +workaround: Unknown +""" + +rf"hello" diff --git a/tests/cpydiff/core_fstring_repr.py b/tests/cpydiff/core_fstring_repr.py new file mode 100644 index 0000000000..fcadcbf1b9 --- /dev/null +++ b/tests/cpydiff/core_fstring_repr.py @@ -0,0 +1,18 @@ +""" +categories: Core +description: f-strings don't support the !r, !s, and !a conversions +cause: MicroPython is optimised for code space. +workaround: Use repr(), str(), and ascii() explicitly. +""" + + +class X: + def __repr__(self): + return "repr" + + def __str__(self): + return "str" + + +print(f"{X()!r}") +print(f"{X()!s}") diff --git a/tests/feature_check/fstring.py b/tests/feature_check/fstring.py new file mode 100644 index 0000000000..14792bce0a --- /dev/null +++ b/tests/feature_check/fstring.py @@ -0,0 +1,3 @@ +# check whether f-strings (PEP-498) are supported +a = 1 +print(f"a={a}") diff --git a/tests/feature_check/fstring.py.exp b/tests/feature_check/fstring.py.exp new file mode 100644 index 0000000000..73cdb8bcc8 --- /dev/null +++ b/tests/feature_check/fstring.py.exp @@ -0,0 +1 @@ +a=1 diff --git a/tests/run-tests.py b/tests/run-tests.py index 619df5ed36..3e97a7c87d 100755 --- a/tests/run-tests.py +++ b/tests/run-tests.py @@ -290,6 +290,7 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1): skip_const = False skip_revops = False skip_io_module = False + skip_fstring = False skip_endian = False has_complex = True has_coverage = False @@ -348,6 +349,11 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1): if output != b"uio\n": skip_io_module = True + # Check if fstring feature 
is enabled, and skip such tests if it's not + output = run_feature_check(pyb, args, base_path, "fstring.py") + if output != b"a=1\n": + skip_fstring = True + # Check if emacs repl is supported, and skip such tests if it's not t = run_feature_check(pyb, args, base_path, "repl_emacs_check.py") if "True" not in str(t, "ascii"): @@ -543,6 +549,7 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1): is_async = test_name.startswith(("async_", "uasyncio_")) is_const = test_name.startswith("const") is_io_module = test_name.startswith("io_") + is_fstring = test_name.startswith("string_fstring") skip_it = test_file in skip_tests skip_it |= skip_native and is_native @@ -555,6 +562,7 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1): skip_it |= skip_const and is_const skip_it |= skip_revops and "reverse_op" in test_name skip_it |= skip_io_module and is_io_module + skip_it |= skip_fstring and is_fstring if args.list_tests: if not skip_it: diff --git a/tools/tinytest-codegen.py b/tools/tinytest-codegen.py index cba0b94480..ddbe62b846 100755 --- a/tools/tinytest-codegen.py +++ b/tools/tinytest-codegen.py @@ -99,6 +99,8 @@ exclude_tests = ( "misc/sys_settrace_loop.py", "misc/sys_settrace_generator.py", "misc/sys_settrace_features.py", + # don't have f-string + "basics/string_fstring.py", ) output = []