Skip to content

Commit 50ef0d0

Browse files
committed
codec
1 parent 1aa3fa3 commit 50ef0d0

File tree

1 file changed

+60
-10
lines changed

1 file changed

+60
-10
lines changed

Lib/_pycodecs.py

Lines changed: 60 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,16 +1075,46 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder="little"):
10751075
elif byteorder == "big":
10761076
bom = "big"
10771077

1078-
for c in s:
1079-
ch = ord(c)
1080-
ch2 = 0
1081-
if ch >= 0x10000:
1078+
i = 0
1079+
while i < len(s):
1080+
ch = ord(s[i])
1081+
1082+
# Check for surrogates - each surrogate is invalid in UTF-16
1083+
# regardless of whether it could form a pair
1084+
if 0xD800 <= ch <= 0xDFFF:
1085+
# Surrogate - handle with error handler
1086+
startinpos = i
1087+
endinpos = i + 1
1088+
res = unicode_call_errorhandler(
1089+
errors, "utf-16-le" if bom == "little" else "utf-16-be",
1090+
"surrogates not allowed", s, startinpos, endinpos
1091+
)
1092+
# res[0] is the replacement string, res[1] is the new position
1093+
for replacement_char in res[0]:
1094+
rch = ord(replacement_char)
1095+
if rch >= 0x10000:
1096+
# Encode as surrogate pair
1097+
rch2 = 0xDC00 | ((rch - 0x10000) & 0x3FF)
1098+
rch = 0xD800 | ((rch - 0x10000) >> 10)
1099+
p += STORECHAR(rch, bom)
1100+
p += STORECHAR(rch2, bom)
1101+
elif 0xD800 <= rch <= 0xDFFF:
1102+
# Surrogates in the handler's replacement are silently dropped (nothing is emitted for them)
1103+
pass
1104+
else:
1105+
p += STORECHAR(rch, bom)
1106+
i = res[1]
1107+
elif ch >= 0x10000:
1108+
# Regular character above BMP - encode as surrogate pair
10821109
ch2 = 0xDC00 | ((ch - 0x10000) & 0x3FF)
10831110
ch = 0xD800 | ((ch - 0x10000) >> 10)
1084-
1085-
p += STORECHAR(ch, bom)
1086-
if ch2:
1111+
p += STORECHAR(ch, bom)
10871112
p += STORECHAR(ch2, bom)
1113+
i += 1
1114+
else:
1115+
# Regular BMP character
1116+
p += STORECHAR(ch, bom)
1117+
i += 1
10881118

10891119
return p
10901120

@@ -1183,9 +1213,29 @@ def STORECHAR32(ch, byteorder):
11831213
if size == 0:
11841214
return p
11851215

1186-
for c in s:
1187-
ch = ord(c)
1188-
p += STORECHAR32(ch, bom)
1216+
i = 0
1217+
while i < len(s):
1218+
ch = ord(s[i])
1219+
1220+
# Check for surrogates - they are not valid in UTF-32
1221+
if 0xD800 <= ch <= 0xDFFF:
1222+
# Surrogate - handle with error handler
1223+
startinpos = i
1224+
endinpos = i + 1
1225+
res = unicode_call_errorhandler(
1226+
errors, "utf-32-le" if bom == "little" else "utf-32-be",
1227+
"surrogates not allowed", s, startinpos, endinpos, False
1228+
)
1229+
# res[0] is the replacement string, res[1] is the new position
1230+
for replacement_char in res[0]:
1231+
rch = ord(replacement_char)
1232+
# Surrogates in the handler's replacement are silently skipped; only non-surrogates are stored
1233+
if not (0xD800 <= rch <= 0xDFFF):
1234+
p += STORECHAR32(rch, bom)
1235+
i = res[1]
1236+
else:
1237+
p += STORECHAR32(ch, bom)
1238+
i += 1
11891239

11901240
return p
11911241

0 commit comments

Comments
 (0)