Skip to content

Tokenizer produces token '<>', missing in the language grammar (regardless of __future__.barry_as_FLUFL settings) #151464

@skirpichev

Description

@skirpichev

Long live the SC!

This is an outcome of #151303. In particular, that issue proposed a friendlier syntax error message for code like 1 <> 2.

Now consider this:

$ cat a.py
1 <> 2
$ cat a.py | python3.14 -m tokenize
1,0-1,1:            NUMBER         '1'            
1,2-1,4:            OP             '<>'           
1,5-1,6:            NUMBER         '2'            
1,6-1,7:            NEWLINE        '\n'           
2,0-2,0:            ENDMARKER      ''             

vs

$ cat b.py
1 !! 2
$ cat b.py | python3.14 -m tokenize
1,0-1,1:            NUMBER         '1'            
1,2-1,3:            OP             '!'            
1,3-1,4:            OP             '!'            
1,5-1,6:            NUMBER         '2'            
1,6-1,7:            NEWLINE        '\n'           
2,0-2,0:            ENDMARKER      ''             

Officially, the Python grammar has no <> token, but that's not quite true: the evil <> (as a synonym for !=) was introduced by a previous (unsuccessful) evil attempt to overthrow Guido (Long live the BDFL!).

I think we are all in danger. The attached patch (which enables different tokenization only in the REPL) shows that, in principle, it's possible to apply __future__ settings to the tokenizer too.

The Holy Patch That Saves The World
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 7f25afec302..dd982a1f68d 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -1280,7 +1280,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     /* Check for two-character token */
     {
         int c2 = tok_nextc(tok);
-        int current_token = _PyToken_TwoChars(c, c2);
+        int current_token = _PyToken_TwoChars(c, c2, tok->BARRY_AS_BDFL);
         if (current_token != OP) {
             int c3 = tok_nextc(tok);
             int current_token3 = _PyToken_ThreeChars(c, c2, c3);
diff --git a/Parser/lexer/state.c b/Parser/lexer/state.c
index 5cf9b4d768c..72daaa2a464 100644
--- a/Parser/lexer/state.c
+++ b/Parser/lexer/state.c
@@ -63,6 +63,7 @@ _PyTokenizer_tok_new(void)
 #ifdef Py_DEBUG
     tok->debug = _Py_GetConfig()->parser_debug;
 #endif
+    tok->BARRY_AS_BDFL = 0;
     return tok;
 }
 
diff --git a/Parser/lexer/state.h b/Parser/lexer/state.h
index 9cd196a114c..c367a4a0b7d 100644
--- a/Parser/lexer/state.h
+++ b/Parser/lexer/state.h
@@ -137,6 +137,7 @@ struct tok_state {
 #ifdef Py_DEBUG
     int debug;
 #endif
+    int BARRY_AS_BDFL;
 };
 
 int _PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
diff --git a/Parser/pegen.c b/Parser/pegen.c
index 569f5afb312..9954231cb41 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -1042,10 +1042,11 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
                        PyCompilerFlags *flags, PyArena *arena, PyObject *module)
 {
     int exec_input = start_rule == Py_file_input;
+    int parser_flags = compute_parser_flags(flags);
 
     struct tok_state *tok;
     if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
-        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
+        tok = _PyTokenizer_FromUTF8(str, exec_input, 0, parser_flags & PyPARSE_BARRY_AS_BDFL);
     } else {
         tok = _PyTokenizer_FromString(str, exec_input, 0);
     }
@@ -1062,7 +1063,6 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
     // We need to clear up from here on
     mod_ty result = NULL;
 
-    int parser_flags = compute_parser_flags(flags);
     int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
         flags->cf_feature_version : PY_MINOR_VERSION;
     Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
diff --git a/Parser/tokenizer/tokenizer.h b/Parser/tokenizer/tokenizer.h
index 8fbeb2d6ae6..bf04cf40a48 100644
--- a/Parser/tokenizer/tokenizer.h
+++ b/Parser/tokenizer/tokenizer.h
@@ -4,7 +4,7 @@
 #include "Python.h"
 
 struct tok_state *_PyTokenizer_FromString(const char *, int, int);
-struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int, int);
 struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
 struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                               const char *, const char *);

diff --git a/Parser/tokenizer/utf8_tokenizer.c b/Parser/tokenizer/utf8_tokenizer.c
index 1a925f44540..f31a7fe606a 100644
--- a/Parser/tokenizer/utf8_tokenizer.c
+++ b/Parser/tokenizer/utf8_tokenizer.c
@@ -28,7 +28,8 @@ tok_underflow_string(struct tok_state *tok) {
 
 /* Set up tokenizer for UTF-8 string */
 struct tok_state *
-_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf,
+                      int BARRY_AS_BDFL)
 {
     struct tok_state *tok = _PyTokenizer_tok_new();
     char *translated;
@@ -51,5 +52,6 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
     tok->buf = tok->cur = tok->inp = translated;
     tok->end = translated;
     tok->underflow = &tok_underflow_string;
+    tok->BARRY_AS_BDFL = BARRY_AS_BDFL;
     return tok;
 }
diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py
index 9ee5ec86e75..c7603e212d2 100755
--- a/Tools/build/generate_token.py
+++ b/Tools/build/generate_token.py
@@ -94,7 +94,7 @@ def update_file(file, content):
 // Export these 4 symbols for 'test_peg_generator'
 PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
 PyAPI_FUNC(int) _PyToken_OneChar(int);
-PyAPI_FUNC(int) _PyToken_TwoChars(int, int);
+PyAPI_FUNC(int) _PyToken_TwoChars(int, int, int);
 PyAPI_FUNC(int) _PyToken_ThreeChars(int, int, int);
 
 #ifdef __cplusplus
@@ -142,7 +142,7 @@ def make_h(infile, outfile='Include/internal/pycore_token.h'):
 }
 
 int
-_PyToken_TwoChars(int c1, int c2)
+_PyToken_TwoChars(int c1, int c2, int BARRY_AS_BDFL)
 {
 %s\
     return OP;
@@ -171,7 +171,10 @@ def generate_chars_to_token(mapping, n=1):
             write(indent)
             write('    break;\n')
         else:
-            write("case '%s': return %s;\n" % (c, value))
+            if c == '>' and value == 'NOTEQUAL':
+                write("case '%s': return BARRY_AS_BDFL ? %s : OP;\n" % (c, value))
+            else:
+                write("case '%s': return %s;\n" % (c, value))
     write(indent)
     write('}\n')
     return ''.join(result)

Should we apply something like this? Of course, the referenced issue could also be solved with the evil token left in place. Then again, @warsaw might overthrow the SC. BTW, Long live the FLUFL too! (I think it's safer to glorify all dictators, just in case.)

Metadata

Metadata

Assignees

Labels

interpreter-core (Objects, Python, Grammar, and Parser dirs) · topic-parser · type-bug (An unexpected behavior, bug, or error)
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions