Move adjacent ew detection to get_phrase

medmunds · bitdancer · medmunds · commit 5a92b2ae1cb4 · 2026-05-11T13:13:22.000-07:00
Switch to @bitdancer's fix from review feedback. Recharacterize space between ews as fws after parsing in get_phrase (rather than peeking ahead after first ew in get_atom). Co-authored-by: R David Murray <rdmurray@bitdance.com>
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
@@ -1352,18 +1352,6 @@ def get_atom(value):
     atom.append(token)
     if value and value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
-        # Peek ahead to ignore linear-white-space between adjacent encoded-words.
-        if (
-            atom[-1].token_type == 'encoded-word'
-            and value.startswith('=?')
-            and all(ws.token_type == 'fws' for ws in token)  # not comments
-        ):
-            try:
-                get_encoded_word(value)
-            except errors.HeaderParseError:
-                pass
-            else:
-                token = EWWhiteSpaceTerminal(token, 'fws')
         atom.append(token)
     return atom, value
 
@@ -1473,6 +1461,16 @@ def get_phrase(value):
         else:
             try:
                 token, value = get_word(value)
+                if (token[0].token_type == 'encoded-word'
+                    and phrase
+                    and phrase[-1].token_type == 'atom'
+                    and len(phrase[-1]) > 1
+                    and phrase[-1][-2].token_type == 'encoded-word'
+                    and phrase[-1][-1].token_type == 'cfws'
+                    and not phrase[-1][-1].comments
+                ):
+                    # linear ws between ews needs special handling...
+                    phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1], 'fws')
             except errors.HeaderParseError:
                 if value[0] in CFWS_LEADER:
                     token, value = get_cfws(value)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
@@ -1061,9 +1061,8 @@ def get_phrase_cfws_only_raises(self):
             parser.get_phrase(' (foo) ')
 
     def test_get_phrase_adjacent_ew(self):
-        # In structured headers, the requirement to ignore linear-white-space
-        # between adjacent encoded-words is actually implemented by get_atom.
-        # But it's easier to see the results by testing get_phrase.
+        # "'linear-white-space' that separates a pair of adjacent
+        # 'encoded-word's is ignored" (rfc2047 section 6.2)
         self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '')
 
     def test_get_phrase_adjacent_ew_different_encodings(self):