From 0b7184dcb8250e6654e55174753b24e4cbfea052 Mon Sep 17 00:00:00 2001 From: Paul Sokolovsky Date: Wed, 22 Jan 2014 22:40:02 +0200 Subject: [PATCH] Implement octal and hex escapes in strings. --- py/lexer.c | 64 +++++++++++++++++++++++++++++++---- py/misc.h | 1 + py/unicode.c | 4 +++ tests/basics/string-escape.py | 11 ++++++ 4 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 tests/basics/string-escape.py diff --git a/py/lexer.c b/py/lexer.c index daaeebf511..af413021b1 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -126,6 +126,10 @@ static bool is_following_digit(mp_lexer_t *lex) { return unichar_isdigit(lex->chr1); } +static bool is_following_odigit(mp_lexer_t *lex) { + return lex->chr1 >= '0' && lex->chr1 <= '7'; +} + // TODO UNICODE include unicode characters in definition of identifiers static bool is_head_of_identifier(mp_lexer_t *lex) { return is_letter(lex) || lex->chr0 == '_'; @@ -275,6 +279,32 @@ static const char *tok_kw[] = { NULL, }; +static int hex_digit(unichar c) { + // c is assumed to be hex digit + int n = c - '0'; + if (n > 9) { + n &= ~('a' - 'A'); + n -= ('A' - ('9' + 1)); + } + return n; +} + +// This is called with CUR_CHAR() before first hex digit, and should return with +// it pointing to last hex digit +static bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) { + uint num = 0; + while (num_digits-- != 0) { + next_char(lex); + unichar c = CUR_CHAR(lex); + if (!unichar_isxdigit(c)) { + return false; + } + num = (num << 4) + hex_digit(c); + } + *result = num; + return true; +} + static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) { // skip white space and comments bool had_physical_newline = false; @@ -439,12 +469,34 @@ static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs case 'v': c = 0x0b; break; case 'f': c = 0x0c; break; case 'r': c = 0x0d; break; - // TODO \ooo octal - case 'x': // TODO \xhh - case 'N': // TODO \N{name} only in strings - case 'u': // TODO \uxxxx only in strings - case 'U': // TODO \Uxxxxxxxx only in strings - default: break; // TODO error message + case 'x': + { + uint num; + if (!get_hex(lex, 2, &num)) { + // TODO error message + assert(0); + } + c = num; + break; + } + case 'N': break; // TODO \N{name} only in strings + case 'u': break; // TODO \uxxxx only in strings + case 'U': break; // TODO \Uxxxxxxxx only in strings + default: + if (c >= '0' && c <= '7') { + // Octal sequence, 1-3 chars + int digits = 3; + int num = c - '0'; + while (is_following_odigit(lex) && --digits != 0) { + next_char(lex); + num = num * 8 + (CUR_CHAR(lex) - '0'); + } + c = num; + } else { + // TODO error message + assert(0); + } + break; } if (c != MP_LEXER_CHAR_EOF) { vstr_add_char(&lex->vstr, c); diff --git a/py/misc.h b/py/misc.h index 8756c25a07..52498c70bd 100644 --- a/py/misc.h +++ b/py/misc.h @@ -43,6 +43,7 @@ bool unichar_isspace(unichar c); bool unichar_isalpha(unichar c); bool unichar_isprint(unichar c); bool unichar_isdigit(unichar c); +bool unichar_isxdigit(unichar c); /** string ******************************************************/ diff --git a/py/unicode.c b/py/unicode.c index 58c860a0e4..52bc9b9f60 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -62,6 +62,10 @@ bool unichar_isdigit(unichar c) { return c < 128 && (attr[c] & FL_DIGIT) != 0; } +bool unichar_isxdigit(unichar c) { + return unichar_isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); +} + /* bool char_is_alpha_or_digit(unichar c) { return c < 128 && (attr[c] & (FL_ALPHA | FL_DIGIT)) != 0; diff --git a/tests/basics/string-escape.py b/tests/basics/string-escape.py new file mode 100644 index 0000000000..000a8713e6 --- /dev/null +++ b/tests/basics/string-escape.py @@ -0,0 +1,11 @@ +a = "a\1b" +print(len(a)) +print(ord(a[1])) +print(len("a\123b")) +a = "a\12345b" +print(len(a)) +print(ord(a[1])) + +a = "a\xffb" +print(len(a)) +print(ord(a[1]))