py: Convert CR to LF and CR LF to LF in lexer.
The only noticeable difference is how newlines are encoded in triple-quoted strings. The behaviour now matches CPython 3.
This commit is contained in:
parent
3da677e658
commit
32bade19d9
51
py/lexer.c
51
py/lexer.c
|
@ -55,7 +55,7 @@ STATIC bool is_end(mp_lexer_t *lex) {
|
||||||
}
|
}
|
||||||
|
|
||||||
STATIC bool is_physical_newline(mp_lexer_t *lex) {
|
STATIC bool is_physical_newline(mp_lexer_t *lex) {
|
||||||
return lex->chr0 == '\n' || lex->chr0 == '\r';
|
return lex->chr0 == '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
STATIC bool is_char(mp_lexer_t *lex, char c) {
|
STATIC bool is_char(mp_lexer_t *lex, char c) {
|
||||||
|
@ -123,20 +123,10 @@ STATIC void next_char(mp_lexer_t *lex) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
mp_uint_t advance = 1;
|
|
||||||
|
|
||||||
if (lex->chr0 == '\n') {
|
if (lex->chr0 == '\n') {
|
||||||
// LF is a new line
|
// a new line
|
||||||
++lex->line;
|
++lex->line;
|
||||||
lex->column = 1;
|
lex->column = 1;
|
||||||
} else if (lex->chr0 == '\r') {
|
|
||||||
// CR is a new line
|
|
||||||
++lex->line;
|
|
||||||
lex->column = 1;
|
|
||||||
if (lex->chr1 == '\n') {
|
|
||||||
// CR LF is a single new line
|
|
||||||
advance = 2;
|
|
||||||
}
|
|
||||||
} else if (lex->chr0 == '\t') {
|
} else if (lex->chr0 == '\t') {
|
||||||
// a tab
|
// a tab
|
||||||
lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
|
lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
|
||||||
|
@ -145,15 +135,26 @@ STATIC void next_char(mp_lexer_t *lex) {
|
||||||
++lex->column;
|
++lex->column;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (; advance > 0; advance--) {
|
lex->chr0 = lex->chr1;
|
||||||
lex->chr0 = lex->chr1;
|
lex->chr1 = lex->chr2;
|
||||||
lex->chr1 = lex->chr2;
|
lex->chr2 = lex->stream_next_byte(lex->stream_data);
|
||||||
lex->chr2 = lex->stream_next_byte(lex->stream_data);
|
|
||||||
if (lex->chr2 == MP_LEXER_EOF) {
|
if (lex->chr0 == '\r') {
|
||||||
// EOF
|
// CR is a new line, converted to LF
|
||||||
if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
|
lex->chr0 = '\n';
|
||||||
lex->chr2 = '\n'; // insert newline at end of file
|
if (lex->chr1 == '\n') {
|
||||||
}
|
// CR LF is a single new line
|
||||||
|
lex->chr1 = lex->chr2;
|
||||||
|
lex->chr2 = lex->stream_next_byte(lex->stream_data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lex->chr2 == MP_LEXER_EOF) {
|
||||||
|
// EOF, check if we need to insert a newline at end of file
|
||||||
|
if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
|
||||||
|
// if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
|
||||||
|
// otherwise it just inserts a LF
|
||||||
|
lex->chr2 = '\n';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -721,11 +722,15 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
|
||||||
if (lex->chr0 == MP_LEXER_EOF) {
|
if (lex->chr0 == MP_LEXER_EOF) {
|
||||||
lex->chr0 = '\n';
|
lex->chr0 = '\n';
|
||||||
} else if (lex->chr1 == MP_LEXER_EOF) {
|
} else if (lex->chr1 == MP_LEXER_EOF) {
|
||||||
if (lex->chr0 != '\n' && lex->chr0 != '\r') {
|
if (lex->chr0 == '\r') {
|
||||||
|
lex->chr0 = '\n';
|
||||||
|
} else if (lex->chr0 != '\n') {
|
||||||
lex->chr1 = '\n';
|
lex->chr1 = '\n';
|
||||||
}
|
}
|
||||||
} else if (lex->chr2 == MP_LEXER_EOF) {
|
} else if (lex->chr2 == MP_LEXER_EOF) {
|
||||||
if (lex->chr1 != '\n' && lex->chr1 != '\r') {
|
if (lex->chr1 == '\r') {
|
||||||
|
lex->chr1 = '\n';
|
||||||
|
} else if (lex->chr1 != '\n') {
|
||||||
lex->chr2 = '\n';
|
lex->chr2 = '\n';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
# this file has CR line endings to test lexer's conversion of them to LF
# in triple quoted strings
print(repr("""abc
def"""))
|
|
@ -0,0 +1,4 @@
|
||||||
|
# this file has CRLF line endings to test lexer's conversion of them to LF
|
||||||
|
# in triple quoted strings
|
||||||
|
print(repr("""abc
|
||||||
|
def"""))
|
Loading…
Reference in New Issue