Skip to content

Commit 7f189f8

Browse files
committed
Skip constructed tests in test_tokenizer that attempt to build
HTMLUnicodeInputStream objects from unicode strings that contain isolated surrogates. Such tests are not meaningful on Jython, which does not allow invalid unicode strings to be decoded in the first place.
1 parent a6c4b41 commit 7f189f8

File tree

1 file changed

+21
-1
lines changed

1 file changed

+21
-1
lines changed

html5lib/tests/test_tokenizer.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import absolute_import, division, unicode_literals
22

33
import json
4+
import platform
45
import warnings
56
import re
67

@@ -122,9 +123,26 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
122123
return tokens["expected"] == tokens["received"]
123124

124125

126+
_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")
127+
128+
125129
def unescape(test):
126130
def decode(inp):
127-
return inp.encode("utf-8").decode("unicode-escape")
131+
try:
132+
return inp.encode("utf-8").decode("unicode-escape")
133+
except UnicodeDecodeError:
134+
possible_surrogate_match = _surrogateRe.search(inp)
135+
if possible_surrogate_match and platform.python_implementation() == "Jython":
136+
possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
137+
if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
138+
# Not valid unicode input for Jython.
139+
#
140+
# NOTE it's not even possible to have such
141+
# isolated surrogates in unicode input streams in
142+
# Jython - the decoding to unicode would have
143+
# raised a similar UnicodeDecodeError.
144+
return None
145+
raise
128146

129147
test["input"] = decode(test["input"])
130148
for token in test["output"]:
@@ -183,6 +201,8 @@ def testTokenizer():
183201
test["initialStates"] = ["Data state"]
184202
if 'doubleEscaped' in test:
185203
test = unescape(test)
204+
if test["input"] is None:
205+
continue # Not valid input for this platform
186206
for initialState in test["initialStates"]:
187207
test["initialState"] = capitalize(initialState)
188208
yield runTokenizerTest, test

0 commit comments

Comments
 (0)