|
1 | 1 | from __future__ import absolute_import, division, unicode_literals
|
2 | 2 |
|
3 | 3 | import json
|
| 4 | +import platform |
4 | 5 | import warnings
|
5 | 6 | import re
|
6 | 7 |
|
@@ -122,9 +123,26 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
|
122 | 123 | return tokens["expected"] == tokens["received"]
|
123 | 124 |
|
124 | 125 |
|
| 126 | +_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})") |
| 127 | + |
| 128 | + |
125 | 129 | def unescape(test):
|
def decode(inp):
    """Expand the backslash escapes of a double-escaped test string.

    Returns the decoded text.  On Jython, returns ``None`` instead when
    decoding fails and the input contains a four-hex-digit unicode escape
    in the surrogate range (U+D800 to U+DFFF); callers treat ``None`` as
    "not valid input for this platform".  Any other ``UnicodeDecodeError``
    is re-raised unchanged.
    """
    try:
        return inp.encode("utf-8").decode("unicode-escape")
    except UnicodeDecodeError:
        # Test the platform first so other implementations re-raise
        # without doing any regex work.  Every escape in the input is
        # inspected, not just the first one: the surrogate that caused
        # the failure may follow perfectly valid escapes.
        if platform.python_implementation() == "Jython" and any(
                0xD800 <= int(match.group("codepoint"), 16) <= 0xDFFF
                for match in _surrogateRe.finditer(inp)):
            # Not valid unicode input for Jython.
            #
            # NOTE it's not even possible to have such isolated
            # surrogates in unicode input streams in Jython - the
            # decoding to unicode would have raised a similar
            # UnicodeDecodeError.
            return None
        raise
128 | 146 |
|
129 | 147 | test["input"] = decode(test["input"])
|
130 | 148 | for token in test["output"]:
|
@@ -183,6 +201,8 @@ def testTokenizer():
|
183 | 201 | test["initialStates"] = ["Data state"]
|
184 | 202 | if 'doubleEscaped' in test:
|
185 | 203 | test = unescape(test)
|
| 204 | + if test["input"] is None: |
| 205 | + continue # Not valid input for this platform |
186 | 206 | for initialState in test["initialStates"]:
|
187 | 207 | test["initialState"] = capitalize(initialState)
|
188 | 208 | yield runTokenizerTest, test
|
0 commit comments