python · serhiy-storchaka · Oct 5, 2018 · Apr 15, 2023
diff --git a/Lib/codecs.py b/Lib/codecs.py
@@ -10,6 +10,9 @@
 import builtins
 import sys
 
+# The re module is imported lazily, see below
+re = None
+
 ### Registry and builtin stateless codec functions
 
 try:
@@ -506,7 +509,7 @@ def read(self, size=-1, chars=-1, firstline=False):
                 if firstline:
                     newchars, decodedbytes = \
                         self.decode(data[:exc.start], self.errors)
-                    lines = newchars.splitlines(keepends=True)
+                    lines = split_lines_keep_ends(newchars)
                     if len(lines)<=1:
                         raise
                 else:
@@ -548,7 +551,7 @@ def readline(self, size=None, keepends=True):
                 self.charbuffer = self.linebuffer[0]
                 self.linebuffer = None
             if not keepends:
-                line = line.splitlines(keepends=False)[0]
+                line = first_line_without_end(line)
             return line
 
         readsize = size or 72
@@ -565,7 +568,7 @@ def readline(self, size=None, keepends=True):
                     data += self.read(size=1, chars=1)
 
             line += data
-            lines = line.splitlines(keepends=True)
+            lines = split_lines_keep_ends(line)
             if lines:
                 if len(lines) > 1:
                     # More than one line result; the first line is a full line
@@ -581,10 +584,10 @@ def readline(self, size=None, keepends=True):
                         # only one remaining line, put it back into charbuffer
                         self.charbuffer = lines[0] + self.charbuffer
                     if not keepends:
-                        line = line.splitlines(keepends=False)[0]
+                        line = first_line_without_end(line)
                     break
                 line0withend = lines[0]
-                line0withoutend = lines[0].splitlines(keepends=False)[0]
+                line0withoutend = first_line_without_end(lines[0])
                 if line0withend != line0withoutend: # We really have a line end
                     # Put the rest back together and keep it until the next call
                     self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
@@ -597,7 +600,7 @@ def readline(self, size=None, keepends=True):
             # we didn't get anything or this was our only try
             if not data or size is not None:
                 if line and not keepends:
-                    line = line.splitlines(keepends=False)[0]
+                    line = first_line_without_end(line)
                 break
             if readsize < 8000:
                 readsize *= 2
@@ -616,7 +619,10 @@ def readlines(self, sizehint=None, keepends=True):
 
         """
         data = self.read()
-        return data.splitlines(keepends)
+        if keepends:
+            return split_lines_keep_ends(data)
+        else:
+            return split_lines_no_keep_ends(data)
 
     def reset(self):
 
@@ -819,7 +825,7 @@ def readlines(self, sizehint=None):
 
         data = self.reader.read()
         data, bytesencoded = self.encode(data, self.errors)
-        return data.splitlines(keepends=True)
+        return split_lines_keep_ends(data)
 
     def __next__(self):
 
@@ -1092,6 +1098,80 @@ def make_encoding_map(decoding_map):
             m[v] = None
     return m
 
+### Helpers for splitting lines
+# Regular expressions are used to split lines only on CR, LF and CRLF, as
+# io text streams do; str.splitlines() also splits on other Unicode line
+# boundaries.
+
+# The re module is imported and the regular expressions are compiled
+# lazily.  They are needed only when codecs streams are used.  The codecs
+# module itself is imported at startup time and should be kept lightweight.
+def init_re():
+    """Import re and compile the line splitting patterns on first use."""
+    global re, newline_re, after_newline_re, no_newline_re
+    import re as re_module
+    newline_re = re_module.compile(r'\n|\r\n?')
+    after_newline_re = re_module.compile(r'(?<=\n)|(?<=\r)(?!\n)')
+    no_newline_re = re_module.compile(r'[^\n\r]*')
+    # The helpers below test "re is None" to decide whether to call this
+    # function, so bind the global only after every pattern exists.
+    re = re_module
+
+def split_lines_no_keep_ends(s):
+    """Split s into a list of lines without their line ends.
+
+    str input is split on CR, LF and CRLF only; any other input is split
+    with its own splitlines() method.  As with splitlines(), an empty
+    input gives an empty list and a final line end does not add a
+    trailing empty line.
+    """
+    if not isinstance(s, str):
+        return s.splitlines()
+    if re is None:
+        init_re()
+    lines = newline_re.split(s)
+    # re.split() reports the empty text after a final line end (or an
+    # empty input) as a trailing '' item; splitlines() does not.
+    if not lines[-1]:
+        del lines[-1]
+    return lines
+
+def split_lines_keep_ends(s):
+    """Split s into a list of lines, each keeping its line end.
+
+    str input is split after CR, LF and CRLF only; any other input is
+    split with its own splitlines(keepends=True) method.  As with
+    splitlines(), an empty input gives an empty list and a final line
+    end does not add a trailing empty line.
+    """
+    if not isinstance(s, str):
+        return s.splitlines(keepends=True)
+    if re is None:
+        init_re()
+    # The pattern matches the empty string after each line end, so this
+    # relies on re.split() splitting on empty matches.
+    lines = after_newline_re.split(s)
+    # Drop the trailing '' item left after a final line end (or for an
+    # empty input) to match splitlines().
+    if not lines[-1]:
+        del lines[-1]
+    return lines
+
+def first_line_without_end(s):
+    """Return the first line of s without its line end.
+
+    str input ends at the first CR or LF; any other input is split with
+    its own splitlines() method.  An empty input gives an empty result
+    of the same type.
+    """
+    if isinstance(s, str):
+        if re is None:
+            init_re()
+        return no_newline_re.match(s).group()
+    lines = s.splitlines()
+    # s[:0] is an empty sequence of the same type as s.
+    return lines[0] if lines else s[:0]
+
 ### error handlers
 
 try:

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
@@ -168,14 +168,14 @@ def readalllines(input, keepends=True, size=None):
             return "|".join(lines)
 
         s = "foo\nbar\r\nbaz\rspam\u2028eggs"
-        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
-        sexpectednoends = "foo|bar|baz|spam|eggs"
+        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028eggs"
+        sexpectednoends = "foo|bar|baz|spam\u2028eggs"
         self.assertEqual(readalllines(s, True), sexpected)
         self.assertEqual(readalllines(s, False), sexpectednoends)
         self.assertEqual(readalllines(s, True, 10), sexpected)
         self.assertEqual(readalllines(s, False, 10), sexpectednoends)
 
-        lineends = ("\n", "\r\n", "\r", "\u2028")
+        lineends = ("\n", "\r\n", "\r")
         # Test long lines (multiple calls to read() in readline())
         vw = []
         vwo = []

diff --git a/Misc/NEWS.d/next/Library/2018-10-05-14-17-41.bpo-18291.cw37jn.rst b/Misc/NEWS.d/next/Library/2018-10-05-14-17-41.bpo-18291.cw37jn.rst
@@ -0,0 +1,4 @@
+:mod:`codecs` text streams now split lines only on ``'\r'``, ``'\n'`` and
+``'\r\n'``, like :mod:`io` text streams in :term:`universal newlines` mode.
+Previously they split lines on all Unicode line boundaries recognized by
+:meth:`str.splitlines`.