@@ -633,3 +633,161 @@ pub mod ascii {
633
633
)
634
634
}
635
635
}
636
+
637
+ pub mod utf16_le {
638
+ use super :: * ;
639
+
640
+ pub const ENCODING_NAME : & str = "utf-16-le" ;
641
+
642
+ pub fn encode < Ctx , E > ( mut ctx : Ctx , errors : & E ) -> Result < Vec < u8 > , Ctx :: Error >
643
+ where
644
+ Ctx : EncodeContext ,
645
+ E : EncodeErrorHandler < Ctx > ,
646
+ {
647
+ let mut out = Vec :: < u8 > :: new ( ) ;
648
+ loop {
649
+ let data = ctx. remaining_data ( ) ;
650
+ let error_info = {
651
+ let mut iter = iter_code_points ( data) ;
652
+ iter. find ( |( _, c) | c. to_u32 ( ) > 0x10FFFF )
653
+ } ;
654
+ let Some ( ( i, ch) ) = error_info else {
655
+ break ;
656
+ } ;
657
+
658
+ // Add valid part up to the error
659
+ for ch in data[ ..i. bytes ] . code_points ( ) {
660
+ let ch_u32 = ch. to_u32 ( ) ;
661
+ if ch_u32 <= 0xFFFF {
662
+ out. extend_from_slice ( & ( ch_u32 as u16 ) . to_le_bytes ( ) ) ;
663
+ } else if ch_u32 <= 0x10FFFF {
664
+ let code = ch_u32 - 0x10000 ;
665
+ let high = 0xD800 + ( code >> 10 ) ;
666
+ let low = 0xDC00 + ( code & 0x3FF ) ;
667
+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
668
+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
669
+ }
670
+ }
671
+
672
+ let err_start = ctx. position ( ) + i;
673
+ let err_end = StrSize { bytes : i. bytes + ch. len_wtf8 ( ) , chars : i. chars + 1 } ;
674
+ let err_end = ctx. position ( ) + err_end;
675
+ let replace = ctx. handle_error ( errors, err_start..err_end, Some ( "surrogates not allowed" ) ) ?;
676
+ match replace {
677
+ EncodeReplace :: Str ( s) => {
678
+ // Re-encode the replacement string
679
+ for cp in s. as_ref ( ) . code_points ( ) {
680
+ let cp_u32 = cp. to_u32 ( ) ;
681
+ if cp_u32 <= 0xFFFF {
682
+ out. extend_from_slice ( & ( cp_u32 as u16 ) . to_le_bytes ( ) ) ;
683
+ } else if cp_u32 <= 0x10FFFF {
684
+ let code = cp_u32 - 0x10000 ;
685
+ let high = 0xD800 + ( code >> 10 ) ;
686
+ let low = 0xDC00 + ( code & 0x3FF ) ;
687
+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
688
+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
689
+ }
690
+ }
691
+ }
692
+ EncodeReplace :: Bytes ( b) => {
693
+ out. extend_from_slice ( b. as_ref ( ) ) ;
694
+ }
695
+ }
696
+ }
697
+
698
+ // Process all remaining data
699
+ for ch in ctx. remaining_data ( ) . code_points ( ) {
700
+ let ch_u32 = ch. to_u32 ( ) ;
701
+ if ch_u32 <= 0xFFFF {
702
+ out. extend_from_slice ( & ( ch_u32 as u16 ) . to_le_bytes ( ) ) ;
703
+ } else if ch_u32 <= 0x10FFFF {
704
+ let code = ch_u32 - 0x10000 ;
705
+ let high = 0xD800 + ( code >> 10 ) ;
706
+ let low = 0xDC00 + ( code & 0x3FF ) ;
707
+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
708
+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
709
+ }
710
+ }
711
+ Ok ( out)
712
+ }
713
+
714
+ pub fn decode < Ctx : DecodeContext , E : DecodeErrorHandler < Ctx > > (
715
+ mut ctx : Ctx ,
716
+ errors : & E ,
717
+ final_decode : bool ,
718
+ ) -> Result < ( Wtf8Buf , usize ) , Ctx :: Error > {
719
+ let mut out = Wtf8Buf :: new ( ) ;
720
+
721
+ while ctx. remaining_data ( ) . len ( ) >= 2 {
722
+ let data = ctx. remaining_data ( ) ;
723
+ let ch = u16:: from_le_bytes ( [ data[ 0 ] , data[ 1 ] ] ) ;
724
+
725
+ if ch < 0xD800 || ch > 0xDFFF {
726
+ // BMP character
727
+ if let Some ( c) = char:: from_u32 ( ch as u32 ) {
728
+ out. push_str ( & c. to_string ( ) ) ;
729
+ ctx. advance ( 2 ) ;
730
+ } else {
731
+ let pos = ctx. position ( ) ;
732
+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "invalid character" ) ) ?;
733
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
734
+ // Don't advance here, the error handler already positioned us
735
+ }
736
+ } else if ch >= 0xD800 && ch <= 0xDBFF {
737
+ // High surrogate
738
+ if data. len ( ) < 4 {
739
+ if final_decode {
740
+ let pos = ctx. position ( ) ;
741
+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "unexpected end of data" ) ) ?;
742
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
743
+ // Don't advance here, the error handler already positioned us
744
+ } else {
745
+ // In partial mode, stop here and return what we have
746
+ break ;
747
+ }
748
+ } else {
749
+ let ch2 = u16:: from_le_bytes ( [ data[ 2 ] , data[ 3 ] ] ) ;
750
+ if ch2 >= 0xDC00 && ch2 <= 0xDFFF {
751
+ // Valid surrogate pair
752
+ let code = ( ( ( ch & 0x3FF ) as u32 ) << 10 ) | ( ( ch2 & 0x3FF ) as u32 ) ;
753
+ let code_point = code + 0x10000 ;
754
+ if let Some ( c) = char:: from_u32 ( code_point) {
755
+ out. push_str ( & c. to_string ( ) ) ;
756
+ ctx. advance ( 4 ) ;
757
+ } else {
758
+ let pos = ctx. position ( ) ;
759
+ let replace = ctx. handle_error ( errors, pos..pos + 4 , Some ( "invalid surrogate pair" ) ) ?;
760
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
761
+ // Don't advance here, the error handler already positioned us
762
+ }
763
+ } else {
764
+ // Invalid surrogate pair
765
+ let pos = ctx. position ( ) ;
766
+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "illegal UTF-16 surrogate" ) ) ?;
767
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
768
+ // Don't advance here, the error handler already positioned us
769
+ }
770
+ }
771
+ } else {
772
+ // Low surrogate without high surrogate
773
+ let pos = ctx. position ( ) ;
774
+ let replace = ctx. handle_error ( errors, pos..pos + 2 , Some ( "illegal UTF-16 surrogate" ) ) ?;
775
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
776
+ // Don't advance here, the error handler already positioned us
777
+ }
778
+ }
779
+
780
+ // Handle remaining single byte
781
+ if ctx. remaining_data ( ) . len ( ) == 1 {
782
+ if final_decode {
783
+ let pos = ctx. position ( ) ;
784
+ let replace = ctx. handle_error ( errors, pos..pos + 1 , Some ( "truncated data" ) ) ?;
785
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
786
+ // Don't advance here, the error handler already positioned us
787
+ }
788
+ // In partial mode, just leave it for next call
789
+ }
790
+
791
+ Ok ( ( out, ctx. position ( ) ) )
792
+ }
793
+ }
0 commit comments