Skip to content

Commit a5c80eb

Browse files
committed
Implement utf16-le
1 parent 1d36034 commit a5c80eb

File tree

3 files changed

+162
-10
lines changed

3 files changed

+162
-10
lines changed

Lib/test/test_codecs.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -878,8 +878,6 @@ class UTF16LETest(ReadTest, unittest.TestCase):
878878
encoding = "utf-16-le"
879879
ill_formed_sequence = b"\x80\xdc"
880880

881-
# TODO: RUSTPYTHON
882-
@unittest.expectedFailure
883881
def test_partial(self):
884882
self.check_partial(
885883
"\x00\xff\u0100\uffff\U00010000",
@@ -922,10 +920,6 @@ def test_nonbmp(self):
922920
self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
923921
"\U00010203")
924922

925-
# TODO: RUSTPYTHON
926-
@unittest.expectedFailure
927-
def test_incremental_surrogatepass(self):
928-
super().test_incremental_surrogatepass()
929923

930924
class UTF16BETest(ReadTest, unittest.TestCase):
931925
encoding = "utf-16-be"

common/src/encodings.rs

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,3 +633,161 @@ pub mod ascii {
633633
)
634634
}
635635
}
636+
637+
pub mod utf16_le {
638+
use super::*;
639+
640+
pub const ENCODING_NAME: &str = "utf-16-le";
641+
642+
pub fn encode<Ctx, E>(mut ctx: Ctx, errors: &E) -> Result<Vec<u8>, Ctx::Error>
643+
where
644+
Ctx: EncodeContext,
645+
E: EncodeErrorHandler<Ctx>,
646+
{
647+
let mut out = Vec::<u8>::new();
648+
loop {
649+
let data = ctx.remaining_data();
650+
let error_info = {
651+
let mut iter = iter_code_points(data);
652+
iter.find(|(_, c)| c.to_u32() > 0x10FFFF)
653+
};
654+
let Some((i, ch)) = error_info else {
655+
break;
656+
};
657+
658+
// Add valid part up to the error
659+
for ch in data[..i.bytes].code_points() {
660+
let ch_u32 = ch.to_u32();
661+
if ch_u32 <= 0xFFFF {
662+
out.extend_from_slice(&(ch_u32 as u16).to_le_bytes());
663+
} else if ch_u32 <= 0x10FFFF {
664+
let code = ch_u32 - 0x10000;
665+
let high = 0xD800 + (code >> 10);
666+
let low = 0xDC00 + (code & 0x3FF);
667+
out.extend_from_slice(&(high as u16).to_le_bytes());
668+
out.extend_from_slice(&(low as u16).to_le_bytes());
669+
}
670+
}
671+
672+
let err_start = ctx.position() + i;
673+
let err_end = StrSize { bytes: i.bytes + ch.len_wtf8(), chars: i.chars + 1 };
674+
let err_end = ctx.position() + err_end;
675+
let replace = ctx.handle_error(errors, err_start..err_end, Some("surrogates not allowed"))?;
676+
match replace {
677+
EncodeReplace::Str(s) => {
678+
// Re-encode the replacement string
679+
for cp in s.as_ref().code_points() {
680+
let cp_u32 = cp.to_u32();
681+
if cp_u32 <= 0xFFFF {
682+
out.extend_from_slice(&(cp_u32 as u16).to_le_bytes());
683+
} else if cp_u32 <= 0x10FFFF {
684+
let code = cp_u32 - 0x10000;
685+
let high = 0xD800 + (code >> 10);
686+
let low = 0xDC00 + (code & 0x3FF);
687+
out.extend_from_slice(&(high as u16).to_le_bytes());
688+
out.extend_from_slice(&(low as u16).to_le_bytes());
689+
}
690+
}
691+
}
692+
EncodeReplace::Bytes(b) => {
693+
out.extend_from_slice(b.as_ref());
694+
}
695+
}
696+
}
697+
698+
// Process all remaining data
699+
for ch in ctx.remaining_data().code_points() {
700+
let ch_u32 = ch.to_u32();
701+
if ch_u32 <= 0xFFFF {
702+
out.extend_from_slice(&(ch_u32 as u16).to_le_bytes());
703+
} else if ch_u32 <= 0x10FFFF {
704+
let code = ch_u32 - 0x10000;
705+
let high = 0xD800 + (code >> 10);
706+
let low = 0xDC00 + (code & 0x3FF);
707+
out.extend_from_slice(&(high as u16).to_le_bytes());
708+
out.extend_from_slice(&(low as u16).to_le_bytes());
709+
}
710+
}
711+
Ok(out)
712+
}
713+
714+
pub fn decode<Ctx: DecodeContext, E: DecodeErrorHandler<Ctx>>(
715+
mut ctx: Ctx,
716+
errors: &E,
717+
final_decode: bool,
718+
) -> Result<(Wtf8Buf, usize), Ctx::Error> {
719+
let mut out = Wtf8Buf::new();
720+
721+
while ctx.remaining_data().len() >= 2 {
722+
let data = ctx.remaining_data();
723+
let ch = u16::from_le_bytes([data[0], data[1]]);
724+
725+
if ch < 0xD800 || ch > 0xDFFF {
726+
// BMP character
727+
if let Some(c) = char::from_u32(ch as u32) {
728+
out.push_str(&c.to_string());
729+
ctx.advance(2);
730+
} else {
731+
let pos = ctx.position();
732+
let replace = ctx.handle_error(errors, pos..pos + 2, Some("invalid character"))?;
733+
out.push_wtf8(replace.as_ref());
734+
// Don't advance here, the error handler already positioned us
735+
}
736+
} else if ch >= 0xD800 && ch <= 0xDBFF {
737+
// High surrogate
738+
if data.len() < 4 {
739+
if final_decode {
740+
let pos = ctx.position();
741+
let replace = ctx.handle_error(errors, pos..pos + 2, Some("unexpected end of data"))?;
742+
out.push_wtf8(replace.as_ref());
743+
// Don't advance here, the error handler already positioned us
744+
} else {
745+
// In partial mode, stop here and return what we have
746+
break;
747+
}
748+
} else {
749+
let ch2 = u16::from_le_bytes([data[2], data[3]]);
750+
if ch2 >= 0xDC00 && ch2 <= 0xDFFF {
751+
// Valid surrogate pair
752+
let code = (((ch & 0x3FF) as u32) << 10) | ((ch2 & 0x3FF) as u32);
753+
let code_point = code + 0x10000;
754+
if let Some(c) = char::from_u32(code_point) {
755+
out.push_str(&c.to_string());
756+
ctx.advance(4);
757+
} else {
758+
let pos = ctx.position();
759+
let replace = ctx.handle_error(errors, pos..pos + 4, Some("invalid surrogate pair"))?;
760+
out.push_wtf8(replace.as_ref());
761+
// Don't advance here, the error handler already positioned us
762+
}
763+
} else {
764+
// Invalid surrogate pair
765+
let pos = ctx.position();
766+
let replace = ctx.handle_error(errors, pos..pos + 2, Some("illegal UTF-16 surrogate"))?;
767+
out.push_wtf8(replace.as_ref());
768+
// Don't advance here, the error handler already positioned us
769+
}
770+
}
771+
} else {
772+
// Low surrogate without high surrogate
773+
let pos = ctx.position();
774+
let replace = ctx.handle_error(errors, pos..pos + 2, Some("illegal UTF-16 surrogate"))?;
775+
out.push_wtf8(replace.as_ref());
776+
// Don't advance here, the error handler already positioned us
777+
}
778+
}
779+
780+
// Handle remaining single byte
781+
if ctx.remaining_data().len() == 1 {
782+
if final_decode {
783+
let pos = ctx.position();
784+
let replace = ctx.handle_error(errors, pos..pos + 1, Some("truncated data"))?;
785+
out.push_wtf8(replace.as_ref());
786+
// Don't advance here, the error handler already positioned us
787+
}
788+
// In partial mode, just leave it for next call
789+
}
790+
791+
Ok((out, ctx.position()))
792+
}
793+
}

vm/src/stdlib/codecs.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -286,12 +286,12 @@ mod _codecs {
286286
delegate_pycodecs!(charmap_build, args, vm)
287287
}
288288
#[pyfunction]
289-
fn utf_16_le_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
290-
delegate_pycodecs!(utf_16_le_encode, args, vm)
289+
/// `utf_16_le_encode` entry point: passes `args` and `vm` to the `do_codec!`
/// macro together with `utf16_le::encode`, replacing the earlier
/// `delegate_pycodecs!` delegation shown on the removed side of this diff.
fn utf_16_le_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
290+
do_codec!(utf16_le::encode, args, vm)
291291
}
292292
#[pyfunction]
293-
fn utf_16_le_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
294-
delegate_pycodecs!(utf_16_le_decode, args, vm)
293+
/// `utf_16_le_decode` entry point: passes `args` and `vm` to the `do_codec!`
/// macro together with `utf16_le::decode`, replacing the earlier
/// `delegate_pycodecs!` delegation shown on the removed side of this diff.
fn utf_16_le_decode(args: DecodeArgs, vm: &VirtualMachine) -> DecodeResult {
294+
do_codec!(utf16_le::decode, args, vm)
295295
}
296296
#[pyfunction]
297297
fn utf_16_be_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {

0 commit comments

Comments
 (0)