py: Change lexer stream API to return bytes not chars.
Lexer is now 8-bit clean inside strings.
This commit is contained in:
parent
07133415d2
commit
94fbe9711a
38
py/lexer.c
38
py/lexer.c
|
@ -45,7 +45,7 @@
|
||||||
struct _mp_lexer_t {
|
struct _mp_lexer_t {
|
||||||
qstr source_name; // name of source
|
qstr source_name; // name of source
|
||||||
void *stream_data; // data for stream
|
void *stream_data; // data for stream
|
||||||
mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
|
mp_lexer_stream_next_byte_t stream_next_byte; // stream callback to get next byte
|
||||||
mp_lexer_stream_close_t stream_close; // stream callback to free
|
mp_lexer_stream_close_t stream_close; // stream callback to free
|
||||||
|
|
||||||
unichar chr0, chr1, chr2; // current cached characters from source
|
unichar chr0, chr1, chr2; // current cached characters from source
|
||||||
|
@ -103,7 +103,7 @@ void mp_token_show(const mp_token_t *tok) {
|
||||||
#define CUR_CHAR(lex) ((lex)->chr0)
|
#define CUR_CHAR(lex) ((lex)->chr0)
|
||||||
|
|
||||||
STATIC bool is_end(mp_lexer_t *lex) {
|
// Returns true when the current cached character (chr0) equals the
// MP_LEXER_EOF end-of-stream sentinel.
STATIC bool is_end(mp_lexer_t *lex) {
|
||||||
return lex->chr0 == MP_LEXER_CHAR_EOF;
|
return lex->chr0 == MP_LEXER_EOF;
|
||||||
}
|
}
|
||||||
|
|
||||||
STATIC bool is_physical_newline(mp_lexer_t *lex) {
|
STATIC bool is_physical_newline(mp_lexer_t *lex) {
|
||||||
|
@ -171,7 +171,7 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
|
||||||
}
|
}
|
||||||
|
|
||||||
STATIC void next_char(mp_lexer_t *lex) {
|
STATIC void next_char(mp_lexer_t *lex) {
|
||||||
if (lex->chr0 == MP_LEXER_CHAR_EOF) {
|
if (lex->chr0 == MP_LEXER_EOF) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -200,10 +200,10 @@ STATIC void next_char(mp_lexer_t *lex) {
|
||||||
for (; advance > 0; advance--) {
|
for (; advance > 0; advance--) {
|
||||||
lex->chr0 = lex->chr1;
|
lex->chr0 = lex->chr1;
|
||||||
lex->chr1 = lex->chr2;
|
lex->chr1 = lex->chr2;
|
||||||
lex->chr2 = lex->stream_next_char(lex->stream_data);
|
lex->chr2 = lex->stream_next_byte(lex->stream_data);
|
||||||
if (lex->chr2 == MP_LEXER_CHAR_EOF) {
|
if (lex->chr2 == MP_LEXER_EOF) {
|
||||||
// EOF
|
// EOF
|
||||||
if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
|
if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
|
||||||
lex->chr2 = '\n'; // insert newline at end of file
|
lex->chr2 = '\n'; // insert newline at end of file
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -491,8 +491,8 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
|
||||||
vstr_add_char(&lex->vstr, '\\');
|
vstr_add_char(&lex->vstr, '\\');
|
||||||
} else {
|
} else {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
|
case MP_LEXER_EOF: break; // TODO a proper error message?
|
||||||
case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
|
case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
|
||||||
case '\\': break;
|
case '\\': break;
|
||||||
case '\'': break;
|
case '\'': break;
|
||||||
case '"': break;
|
case '"': break;
|
||||||
|
@ -546,7 +546,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (c != MP_LEXER_CHAR_EOF) {
|
if (c != MP_LEXER_EOF) {
|
||||||
if (c < 0x110000 && !is_bytes) {
|
if (c < 0x110000 && !is_bytes) {
|
||||||
vstr_add_char(&lex->vstr, c);
|
vstr_add_char(&lex->vstr, c);
|
||||||
} else if (c < 0x100 && is_bytes) {
|
} else if (c < 0x100 && is_bytes) {
|
||||||
|
@ -556,7 +556,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
|
// Add the "character" as a byte so that we remain 8-bit clean.
|
||||||
|
// This way, strings are parsed correctly whether or not they contain utf-8 chars.
|
||||||
|
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
next_char(lex);
|
next_char(lex);
|
||||||
|
@ -728,7 +730,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
|
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
|
||||||
mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
|
mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
|
||||||
|
|
||||||
// check for memory allocation error
|
// check for memory allocation error
|
||||||
|
@ -741,7 +743,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
|
||||||
|
|
||||||
lex->source_name = src_name;
|
lex->source_name = src_name;
|
||||||
lex->stream_data = stream_data;
|
lex->stream_data = stream_data;
|
||||||
lex->stream_next_char = stream_next_char;
|
lex->stream_next_byte = stream_next_byte;
|
||||||
lex->stream_close = stream_close;
|
lex->stream_close = stream_close;
|
||||||
lex->line = 1;
|
lex->line = 1;
|
||||||
lex->column = 1;
|
lex->column = 1;
|
||||||
|
@ -762,18 +764,18 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
|
||||||
lex->indent_level[0] = 0;
|
lex->indent_level[0] = 0;
|
||||||
|
|
||||||
// preload characters
|
// preload characters
|
||||||
lex->chr0 = stream_next_char(stream_data);
|
lex->chr0 = stream_next_byte(stream_data);
|
||||||
lex->chr1 = stream_next_char(stream_data);
|
lex->chr1 = stream_next_byte(stream_data);
|
||||||
lex->chr2 = stream_next_char(stream_data);
|
lex->chr2 = stream_next_byte(stream_data);
|
||||||
|
|
||||||
// if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
|
// if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
|
||||||
if (lex->chr0 == MP_LEXER_CHAR_EOF) {
|
if (lex->chr0 == MP_LEXER_EOF) {
|
||||||
lex->chr0 = '\n';
|
lex->chr0 = '\n';
|
||||||
} else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
|
} else if (lex->chr1 == MP_LEXER_EOF) {
|
||||||
if (lex->chr0 != '\n' && lex->chr0 != '\r') {
|
if (lex->chr0 != '\n' && lex->chr0 != '\r') {
|
||||||
lex->chr1 = '\n';
|
lex->chr1 = '\n';
|
||||||
}
|
}
|
||||||
} else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
|
} else if (lex->chr2 == MP_LEXER_EOF) {
|
||||||
if (lex->chr1 != '\n' && lex->chr1 != '\r') {
|
if (lex->chr1 != '\n' && lex->chr1 != '\r') {
|
||||||
lex->chr2 = '\n';
|
lex->chr2 = '\n';
|
||||||
}
|
}
|
||||||
|
|
12
py/lexer.h
12
py/lexer.h
|
@ -139,18 +139,18 @@ typedef struct _mp_token_t {
|
||||||
mp_uint_t len; // (byte) length of string of token
|
mp_uint_t len; // (byte) length of string of token
|
||||||
} mp_token_t;
|
} mp_token_t;
|
||||||
|
|
||||||
// the next-char function must return the next character in the stream
|
// the next-byte function must return the next byte in the stream
|
||||||
// it must return MP_LEXER_CHAR_EOF if end of stream
|
// it must return MP_LEXER_EOF if end of stream
|
||||||
// it can be called again after returning MP_LEXER_CHAR_EOF, and in that case must return MP_LEXER_CHAR_EOF
|
// it can be called again after returning MP_LEXER_EOF, and in that case must return MP_LEXER_EOF
|
||||||
#define MP_LEXER_CHAR_EOF (-1)
|
#define MP_LEXER_EOF (-1) // end-of-stream sentinel returned by the next-byte callback
|
||||||
typedef unichar (*mp_lexer_stream_next_char_t)(void*);
|
typedef mp_uint_t (*mp_lexer_stream_next_byte_t)(void*); // called with the lexer's stream_data pointer; yields one byte or MP_LEXER_EOF
|
||||||
typedef void (*mp_lexer_stream_close_t)(void*);
|
typedef void (*mp_lexer_stream_close_t)(void*); // callback that frees the stream
|
||||||
|
|
||||||
typedef struct _mp_lexer_t mp_lexer_t;
|
typedef struct _mp_lexer_t mp_lexer_t;
|
||||||
|
|
||||||
void mp_token_show(const mp_token_t *tok);
|
void mp_token_show(const mp_token_t *tok);
|
||||||
|
|
||||||
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close);
|
mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close);
|
||||||
mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len);
|
mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len);
|
||||||
|
|
||||||
void mp_lexer_free(mp_lexer_t *lex);
|
void mp_lexer_free(mp_lexer_t *lex);
|
||||||
|
|
|
@ -36,11 +36,11 @@ typedef struct _mp_lexer_str_buf_t {
|
||||||
const char *src_end; // end (exclusive) of source
|
const char *src_end; // end (exclusive) of source
|
||||||
} mp_lexer_str_buf_t;
|
} mp_lexer_str_buf_t;
|
||||||
|
|
||||||
STATIC unichar str_buf_next_char(mp_lexer_str_buf_t *sb) {
|
// Next-byte callback for an in-memory source buffer: returns the value at
// src_cur and advances the cursor, or MP_LEXER_EOF once src_cur has reached
// src_end (exclusive end of the source).
// NOTE(review): src_cur appears to be a `const char *` (it is assigned from a
// `const char *str`). Where plain char is signed, a byte >= 0x80 would be
// sign-extended when converted to the mp_uint_t return value, and 0xFF would
// compare equal to MP_LEXER_EOF (-1) -- confirm, and consider reading through
// an unsigned byte type.
STATIC mp_uint_t str_buf_next_byte(mp_lexer_str_buf_t *sb) {
|
||||||
if (sb->src_cur < sb->src_end) {
|
if (sb->src_cur < sb->src_end) {
|
||||||
return *sb->src_cur++;
|
return *sb->src_cur++;
|
||||||
} else {
|
} else {
|
||||||
return MP_LEXER_CHAR_EOF;
|
return MP_LEXER_EOF;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -57,5 +57,5 @@ mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t
|
||||||
sb->src_beg = str;
|
sb->src_beg = str;
|
||||||
sb->src_cur = str;
|
sb->src_cur = str;
|
||||||
sb->src_end = str + len;
|
sb->src_end = str + len;
|
||||||
return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_char_t)str_buf_next_char, (mp_lexer_stream_close_t)str_buf_free);
|
return mp_lexer_new(src_name, sb, (mp_lexer_stream_next_byte_t)str_buf_next_byte, (mp_lexer_stream_close_t)str_buf_free);
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,20 +41,20 @@
|
||||||
|
|
||||||
typedef struct _mp_lexer_file_buf_t {
|
// Stream state for lexing from a file descriptor in 20-byte chunks.
// NOTE(review): mp_lexer_new_from_file stores the int result of read() into
// len without checking it; if mp_uint_t is unsigned, a -1 error return would
// become a very large length -- confirm against the full function.
typedef struct _mp_lexer_file_buf_t {
|
||||||
int fd;
|
int fd; // descriptor handed to read()
|
||||||
char buf[20];
|
byte buf[20]; // chunk most recently read from fd
|
||||||
uint len;
|
mp_uint_t len; // byte count of the last read; set to 0 once read() returns <= 0, after which the callback reports EOF
|
||||||
uint pos;
|
mp_uint_t pos; // compared against len to decide when to refill; presumably the index of the next byte to return
|
||||||
} mp_lexer_file_buf_t;
|
} mp_lexer_file_buf_t;
|
||||||
|
|
||||||
STATIC unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
|
STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) {
|
||||||
if (fb->pos >= fb->len) {
|
if (fb->pos >= fb->len) {
|
||||||
if (fb->len == 0) {
|
if (fb->len == 0) {
|
||||||
return MP_LEXER_CHAR_EOF;
|
return MP_LEXER_EOF;
|
||||||
} else {
|
} else {
|
||||||
int n = read(fb->fd, fb->buf, sizeof(fb->buf));
|
int n = read(fb->fd, fb->buf, sizeof(fb->buf));
|
||||||
if (n <= 0) {
|
if (n <= 0) {
|
||||||
fb->len = 0;
|
fb->len = 0;
|
||||||
return MP_LEXER_CHAR_EOF;
|
return MP_LEXER_EOF;
|
||||||
}
|
}
|
||||||
fb->len = n;
|
fb->len = n;
|
||||||
fb->pos = 0;
|
fb->pos = 0;
|
||||||
|
@ -78,7 +78,7 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
|
||||||
int n = read(fb->fd, fb->buf, sizeof(fb->buf));
|
int n = read(fb->fd, fb->buf, sizeof(fb->buf));
|
||||||
fb->len = n;
|
fb->len = n;
|
||||||
fb->pos = 0;
|
fb->pos = 0;
|
||||||
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close);
|
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // MICROPY_HELPER_LEXER_UNIX
|
#endif // MICROPY_HELPER_LEXER_UNIX
|
||||||
|
|
|
@ -36,20 +36,20 @@
|
||||||
|
|
||||||
typedef struct _mp_lexer_file_buf_t {
|
// Stream state for lexing from a FatFs file in 20-byte chunks.
typedef struct _mp_lexer_file_buf_t {
|
||||||
FIL fp;
|
FIL fp; // file object passed to f_read and f_close
|
||||||
char buf[20];
|
byte buf[20]; // chunk most recently read from fp
|
||||||
uint16_t len;
|
uint16_t len; // byte count of that chunk; a count below sizeof(buf) is treated as the final chunk
|
||||||
uint16_t pos;
|
uint16_t pos; // index in buf of the next byte to return
|
||||||
} mp_lexer_file_buf_t;
|
} mp_lexer_file_buf_t;
|
||||||
|
|
||||||
static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
|
STATIC mp_uint_t file_buf_next_byte(mp_lexer_file_buf_t *fb) {
|
||||||
if (fb->pos >= fb->len) {
|
if (fb->pos >= fb->len) {
|
||||||
if (fb->len < sizeof(fb->buf)) {
|
if (fb->len < sizeof(fb->buf)) {
|
||||||
return MP_LEXER_CHAR_EOF;
|
return MP_LEXER_EOF;
|
||||||
} else {
|
} else {
|
||||||
UINT n;
|
UINT n;
|
||||||
f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
|
f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
|
||||||
if (n == 0) {
|
if (n == 0) {
|
||||||
return MP_LEXER_CHAR_EOF;
|
return MP_LEXER_EOF;
|
||||||
}
|
}
|
||||||
fb->len = n;
|
fb->len = n;
|
||||||
fb->pos = 0;
|
fb->pos = 0;
|
||||||
|
@ -58,7 +58,7 @@ static unichar file_buf_next_char(mp_lexer_file_buf_t *fb) {
|
||||||
return fb->buf[fb->pos++];
|
return fb->buf[fb->pos++];
|
||||||
}
|
}
|
||||||
|
|
||||||
static void file_buf_close(mp_lexer_file_buf_t *fb) {
|
// Stream-close callback handed to mp_lexer_new: closes the FatFs file with
// f_close, then releases the mp_lexer_file_buf_t object itself via m_del_obj.
// The result of f_close is not inspected.
STATIC void file_buf_close(mp_lexer_file_buf_t *fb) {
|
||||||
f_close(&fb->fp);
|
f_close(&fb->fp);
|
||||||
m_del_obj(mp_lexer_file_buf_t, fb);
|
m_del_obj(mp_lexer_file_buf_t, fb);
|
||||||
}
|
}
|
||||||
|
@ -74,5 +74,5 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
|
||||||
f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
|
f_read(&fb->fp, fb->buf, sizeof(fb->buf), &n);
|
||||||
fb->len = n;
|
fb->len = n;
|
||||||
fb->pos = 0;
|
fb->pos = 0;
|
||||||
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_char_t)file_buf_next_char, (mp_lexer_stream_close_t)file_buf_close);
|
return mp_lexer_new(qstr_from_str(filename), fb, (mp_lexer_stream_next_byte_t)file_buf_next_byte, (mp_lexer_stream_close_t)file_buf_close);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue