@@ -1075,16 +1075,46 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder="little"):
1075
1075
elif byteorder == "big" :
1076
1076
bom = "big"
1077
1077
1078
- for c in s :
1079
- ch = ord (c )
1080
- ch2 = 0
1081
- if ch >= 0x10000 :
1078
+ i = 0
1079
+ while i < len (s ):
1080
+ ch = ord (s [i ])
1081
+
1082
+ # Check for surrogates - each surrogate is invalid in UTF-16
1083
+ # regardless of whether it could form a pair
1084
+ if 0xD800 <= ch <= 0xDFFF :
1085
+ # Surrogate - handle with error handler
1086
+ startinpos = i
1087
+ endinpos = i + 1
1088
+ res = unicode_call_errorhandler (
1089
+ errors , "utf-16-le" if bom == "little" else "utf-16-be" ,
1090
+ "surrogates not allowed" , s , startinpos , endinpos
1091
+ )
1092
+ # res[0] is the replacement string, res[1] is the new position
1093
+ for replacement_char in res [0 ]:
1094
+ rch = ord (replacement_char )
1095
+ if rch >= 0x10000 :
1096
+ # Encode as surrogate pair
1097
+ rch2 = 0xDC00 | ((rch - 0x10000 ) & 0x3FF )
1098
+ rch = 0xD800 | ((rch - 0x10000 ) >> 10 )
1099
+ p += STORECHAR (rch , bom )
1100
+ p += STORECHAR (rch2 , bom )
1101
+ elif 0xD800 <= rch <= 0xDFFF :
1102
+ # Don't encode surrogates in the replacement
1103
+ pass
1104
+ else :
1105
+ p += STORECHAR (rch , bom )
1106
+ i = res [1 ]
1107
+ elif ch >= 0x10000 :
1108
+ # Regular character above BMP - encode as surrogate pair
1082
1109
ch2 = 0xDC00 | ((ch - 0x10000 ) & 0x3FF )
1083
1110
ch = 0xD800 | ((ch - 0x10000 ) >> 10 )
1084
-
1085
- p += STORECHAR (ch , bom )
1086
- if ch2 :
1111
+ p += STORECHAR (ch , bom )
1087
1112
p += STORECHAR (ch2 , bom )
1113
+ i += 1
1114
+ else :
1115
+ # Regular BMP character
1116
+ p += STORECHAR (ch , bom )
1117
+ i += 1
1088
1118
1089
1119
return p
1090
1120
@@ -1183,9 +1213,29 @@ def STORECHAR32(ch, byteorder):
1183
1213
if size == 0 :
1184
1214
return p
1185
1215
1186
- for c in s :
1187
- ch = ord (c )
1188
- p += STORECHAR32 (ch , bom )
1216
+ i = 0
1217
+ while i < len (s ):
1218
+ ch = ord (s [i ])
1219
+
1220
+ # Check for surrogates - they are not valid in UTF-32
1221
+ if 0xD800 <= ch <= 0xDFFF :
1222
+ # Surrogate - handle with error handler
1223
+ startinpos = i
1224
+ endinpos = i + 1
1225
+ res = unicode_call_errorhandler (
1226
+ errors , "utf-32-le" if bom == "little" else "utf-32-be" ,
1227
+ "surrogates not allowed" , s , startinpos , endinpos , False
1228
+ )
1229
+ # res[0] is the replacement string, res[1] is the new position
1230
+ for replacement_char in res [0 ]:
1231
+ rch = ord (replacement_char )
1232
+ # Don't encode surrogates in the replacement
1233
+ if not (0xD800 <= rch <= 0xDFFF ):
1234
+ p += STORECHAR32 (rch , bom )
1235
+ i = res [1 ]
1236
+ else :
1237
+ p += STORECHAR32 (ch , bom )
1238
+ i += 1
1189
1239
1190
1240
return p
1191
1241
0 commit comments