Tokenizer produces token '<>', missing in the language grammar (regardless of __future__.barry_as_FLUFL settings)

Long live the SC!

This is an outcome of https://github.com/python/cpython/issues/151303.  In particular, that issue proposed a friendlier syntax error message for code like ``1 <> 2``.

Now consider this:
```
$ cat a.py
1 <> 2
$ cat a.py | python3.14 -m tokenize
1,0-1,1:            NUMBER         '1'            
1,2-1,4:            OP             '<>'           
1,5-1,6:            NUMBER         '2'            
1,6-1,7:            NEWLINE        '\n'           
2,0-2,0:            ENDMARKER      ''             
```
vs
```
$ cat b.py
1 !! 2
$ cat b.py | python3.14 -m tokenize
1,0-1,1:            NUMBER         '1'            
1,2-1,3:            OP             '!'            
1,3-1,4:            OP             '!'            
1,5-1,6:            NUMBER         '2'            
1,6-1,7:            NEWLINE        '\n'           
2,0-2,0:            ENDMARKER      ''             
```

Officially, the Python grammar [has no token](https://docs.python.org/3.14/reference/lexical_analysis.html#operators-and-delimiters) ``<>``, but in practice that's not true: the evil ``<>`` (as a synonym for ``!=``) was introduced by a previous (unsuccessful) [evil attempt](https://github.com/python/cpython/commit/e3944a5e1ecf67aa722fd9ce0c0a4ee72ee5ba2d) to overthrow Guido (Long live the BDFL!).

I think we are all in danger.  The attached patch (which enables different tokenization only in the REPL) shows that, in principle, it's possible to apply ``__future__`` settings to the tokenizer too.

<details>

<summary>The Holy Patch That Saves The World</summary>

```diff
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 7f25afec302..dd982a1f68d 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -1280,7 +1280,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     /* Check for two-character token */
     {
         int c2 = tok_nextc(tok);
-        int current_token = _PyToken_TwoChars(c, c2);
+        int current_token = _PyToken_TwoChars(c, c2, tok->BARRY_AS_BDFL);
         if (current_token != OP) {
             int c3 = tok_nextc(tok);
             int current_token3 = _PyToken_ThreeChars(c, c2, c3);
diff --git a/Parser/lexer/state.c b/Parser/lexer/state.c
index 5cf9b4d768c..72daaa2a464 100644
--- a/Parser/lexer/state.c
+++ b/Parser/lexer/state.c
@@ -63,6 +63,7 @@ _PyTokenizer_tok_new(void)
 #ifdef Py_DEBUG
     tok->debug = _Py_GetConfig()->parser_debug;
 #endif
+    tok->BARRY_AS_BDFL = 0;
     return tok;
 }
 
diff --git a/Parser/lexer/state.h b/Parser/lexer/state.h
index 9cd196a114c..c367a4a0b7d 100644
--- a/Parser/lexer/state.h
+++ b/Parser/lexer/state.h
@@ -137,6 +137,7 @@ struct tok_state {
 #ifdef Py_DEBUG
     int debug;
 #endif
+    int BARRY_AS_BDFL;
 };
 
 int _PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
diff --git a/Parser/pegen.c b/Parser/pegen.c
index 569f5afb312..9954231cb41 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -1042,10 +1042,11 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
                        PyCompilerFlags *flags, PyArena *arena, PyObject *module)
 {
     int exec_input = start_rule == Py_file_input;
+    int parser_flags = compute_parser_flags(flags);
 
     struct tok_state *tok;
     if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
-        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
+        tok = _PyTokenizer_FromUTF8(str, exec_input, 0, parser_flags & PyPARSE_BARRY_AS_BDFL);
     } else {
         tok = _PyTokenizer_FromString(str, exec_input, 0);
     }
@@ -1062,7 +1063,6 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
     // We need to clear up from here on
     mod_ty result = NULL;
 
-    int parser_flags = compute_parser_flags(flags);
     int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
         flags->cf_feature_version : PY_MINOR_VERSION;
     Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
diff --git a/Parser/tokenizer/tokenizer.h b/Parser/tokenizer/tokenizer.h
index 8fbeb2d6ae6..bf04cf40a48 100644
--- a/Parser/tokenizer/tokenizer.h
+++ b/Parser/tokenizer/tokenizer.h
@@ -4,7 +4,7 @@
 #include "Python.h"
 
 struct tok_state *_PyTokenizer_FromString(const char *, int, int);
-struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int, int);
 struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
 struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                               const char *, const char *);

diff --git a/Parser/tokenizer/utf8_tokenizer.c b/Parser/tokenizer/utf8_tokenizer.c
index 1a925f44540..f31a7fe606a 100644
--- a/Parser/tokenizer/utf8_tokenizer.c
+++ b/Parser/tokenizer/utf8_tokenizer.c
@@ -28,7 +28,8 @@ tok_underflow_string(struct tok_state *tok) {
 
 /* Set up tokenizer for UTF-8 string */
 struct tok_state *
-_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf,
+                      int BARRY_AS_BDFL)
 {
     struct tok_state *tok = _PyTokenizer_tok_new();
     char *translated;
@@ -51,5 +52,6 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
     tok->buf = tok->cur = tok->inp = translated;
     tok->end = translated;
     tok->underflow = &tok_underflow_string;
+    tok->BARRY_AS_BDFL = BARRY_AS_BDFL;
     return tok;
 }
diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py
index 9ee5ec86e75..c7603e212d2 100755
--- a/Tools/build/generate_token.py
+++ b/Tools/build/generate_token.py
@@ -94,7 +94,7 @@ def update_file(file, content):
 // Export these 4 symbols for 'test_peg_generator'
 PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
 PyAPI_FUNC(int) _PyToken_OneChar(int);
-PyAPI_FUNC(int) _PyToken_TwoChars(int, int);
+PyAPI_FUNC(int) _PyToken_TwoChars(int, int, int);
 PyAPI_FUNC(int) _PyToken_ThreeChars(int, int, int);
 
 #ifdef __cplusplus
@@ -142,7 +142,7 @@ def make_h(infile, outfile='Include/internal/pycore_token.h'):
 }
 
 int
-_PyToken_TwoChars(int c1, int c2)
+_PyToken_TwoChars(int c1, int c2, int BARRY_AS_BDFL)
 {
 %s\
     return OP;
@@ -171,7 +171,10 @@ def generate_chars_to_token(mapping, n=1):
             write(indent)
             write('    break;\n')
         else:
-            write("case '%s': return %s;\n" % (c, value))
+            if c == '>' and value == 'NOTEQUAL':
+                write("case '%s': return BARRY_AS_BDFL ? %s : OP;\n" % (c, value))
+            else:
+                write("case '%s': return %s;\n" % (c, value))
     write(indent)
     write('}\n')
     return ''.join(result)
```

</details>

Should we apply something like this?  Of course, the referenced issue could also be solved while keeping the evil token.  Then again, @warsaw might overthrow the SC.  BTW, long live the FLUFL too!  (I think it's safer to glorify all dictators, just in case.)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Tokenizer produces token '<>', missing in the language grammar (regardless of __future__.barry_as_FLUFL settings) #151464

Metadata

Assignees

Labels

Fields

Projects

Milestone

Relationships

Development

Uh oh!

Tokenizer produces token '<>', missing in the language grammar (regardless of __future__.barry_as_FLUFL settings) #151464

Description

Metadata

Metadata

Assignees

Labels

Fields

Projects

Milestone

Relationships

Development

Issue actions

Tokenizer produces token '<>', missing in the language grammar (regardless of __future__.barry_as_FLUFL settings) #151464