Skip to content

gh-62491: codecs text streams now split lines only with \r, \n and \r\n. #9711

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 52 additions & 8 deletions Lib/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import builtins
import sys

# The re module is imported lazily, see below
re = None

### Registry and builtin stateless codec functions

try:
Expand Down Expand Up @@ -506,7 +509,7 @@ def read(self, size=-1, chars=-1, firstline=False):
if firstline:
newchars, decodedbytes = \
self.decode(data[:exc.start], self.errors)
lines = newchars.splitlines(keepends=True)
lines = split_lines_keep_ends(newchars)
if len(lines)<=1:
raise
else:
Expand Down Expand Up @@ -548,7 +551,7 @@ def readline(self, size=None, keepends=True):
self.charbuffer = self.linebuffer[0]
self.linebuffer = None
if not keepends:
line = line.splitlines(keepends=False)[0]
line = first_line_without_end(line)
return line

readsize = size or 72
Expand All @@ -565,7 +568,7 @@ def readline(self, size=None, keepends=True):
data += self.read(size=1, chars=1)

line += data
lines = line.splitlines(keepends=True)
lines = split_lines_keep_ends(line)
if lines:
if len(lines) > 1:
# More than one line result; the first line is a full line
Expand All @@ -581,10 +584,10 @@ def readline(self, size=None, keepends=True):
# only one remaining line, put it back into charbuffer
self.charbuffer = lines[0] + self.charbuffer
if not keepends:
line = line.splitlines(keepends=False)[0]
line = first_line_without_end(line)
break
line0withend = lines[0]
line0withoutend = lines[0].splitlines(keepends=False)[0]
line0withoutend = first_line_without_end(lines[0])
if line0withend != line0withoutend: # We really have a line end
# Put the rest back together and keep it until the next call
self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
Expand All @@ -597,7 +600,7 @@ def readline(self, size=None, keepends=True):
# we didn't get anything or this was our only try
if not data or size is not None:
if line and not keepends:
line = line.splitlines(keepends=False)[0]
line = first_line_without_end(line)
break
if readsize < 8000:
readsize *= 2
Expand All @@ -616,7 +619,10 @@ def readlines(self, sizehint=None, keepends=True):

"""
data = self.read()
return data.splitlines(keepends)
if keepends:
return split_lines_keep_ends(data)
else:
return split_lines_no_keep_ends(data)

def reset(self):

Expand Down Expand Up @@ -819,7 +825,7 @@ def readlines(self, sizehint=None):

data = self.reader.read()
data, bytesencoded = self.encode(data, self.errors)
return data.splitlines(keepends=True)
return split_lines_keep_ends(data)

def __next__(self):

Expand Down Expand Up @@ -1092,6 +1098,44 @@ def make_encoding_map(decoding_map):
m[v] = None
return m

### Helpers for splitting lines
# Regular expressions are used to split lines only on CR, LF and CRLF, as io
# streams do, because str.splitlines() also splits on other Unicode line ends.

# Lazily import re and compile the regular expressions.
# They are needed only when codecs streams are used. The codecs module itself
# is imported at startup time and should be kept lightweight.
def init_re():
    """Import ``re`` and compile the line-splitting patterns on first use.

    Sets four module globals:

    * ``newline_re`` -- matches one line end: LF, CR or CRLF.
    * ``after_newline_re`` -- zero-width match just after a line end
      (never between the CR and LF of a CRLF pair).
    * ``no_newline_re`` -- matches the leading run of non-CR/LF characters.
    * ``re`` -- the imported module itself.
    """
    global re, newline_re, after_newline_re, no_newline_re
    # Import under a local name first.  The helpers treat "re is None" as
    # "not initialised yet", so the global must not become non-None until
    # every pattern below has been assigned.
    import re as re_module
    newline_re = re_module.compile(r'\n|\r\n?')
    after_newline_re = re_module.compile(r'(?<=\n)|(?<=\r)(?!\n)')
    no_newline_re = re_module.compile(r'[^\n\r]*')
    re = re_module

def split_lines_no_keep_ends(s):
    """Split *s* into lines without their line ends.

    For str input only CR, LF and CRLF count as line ends.  Any other
    type (e.g. bytes) is delegated to its own ``splitlines()``.

    The result follows the ``splitlines()`` convention: empty input gives
    ``[]`` and a final line end does not produce a trailing empty item.
    """
    if isinstance(s, str):
        if re is None:
            init_re()
        lines = newline_re.split(s)
        # re.split() always yields the (possibly empty) text after the last
        # separator.  Drop it when empty so that 'a\n' -> ['a'] and
        # '' -> [], exactly like splitlines(); 'a\n\n' still -> ['a', ''].
        if not lines[-1]:
            del lines[-1]
        return lines
    return s.splitlines()

def split_lines_keep_ends(s):
    """Split *s* into lines, each keeping its line end.

    For str input only CR, LF and CRLF count as line ends.  Any other
    type (e.g. bytes) is delegated to ``splitlines(keepends=True)``.

    The result follows the ``splitlines()`` convention: empty input gives
    ``[]`` and text ending in a line end has no trailing empty item, so
    ``len(result)`` is the number of lines actually present.
    """
    if isinstance(s, str):
        if re is None:
            init_re()
        lines = after_newline_re.split(s)
        # The zero-width pattern also matches at the very end of the text
        # when it finishes with a line end, and re.split() returns [''] for
        # empty text.  Both cases leave an empty last item; remove it.
        if not lines[-1]:
            del lines[-1]
        return lines
    return s.splitlines(keepends=True)

def first_line_without_end(s):
    """Return the first line of *s* with its line end removed.

    For str input the line stops at the first CR or LF; other Unicode
    line separators are kept as ordinary characters.  Any other type
    (e.g. bytes) is split with its own ``splitlines()``.

    Empty input is returned unchanged for every type.
    """
    if isinstance(s, str):
        if re is None:
            init_re()
        # The pattern can match the empty string, so match() never fails.
        return no_newline_re.match(s).group()
    lines = s.splitlines()
    # splitlines() gives [] for empty input; return the empty value itself
    # rather than raising IndexError, mirroring the str branch.
    return lines[0] if lines else s

### error handlers

try:
Expand Down
6 changes: 3 additions & 3 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,14 @@ def readalllines(input, keepends=True, size=None):
return "|".join(lines)

s = "foo\nbar\r\nbaz\rspam\u2028eggs"
sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
sexpectednoends = "foo|bar|baz|spam|eggs"
sexpected = "foo\n|bar\r\n|baz\r|spam\u2028eggs"
sexpectednoends = "foo|bar|baz|spam\u2028eggs"
self.assertEqual(readalllines(s, True), sexpected)
self.assertEqual(readalllines(s, False), sexpectednoends)
self.assertEqual(readalllines(s, True, 10), sexpected)
self.assertEqual(readalllines(s, False, 10), sexpectednoends)

lineends = ("\n", "\r\n", "\r", "\u2028")
lineends = ("\n", "\r\n", "\r")
# Test long lines (multiple calls to read() in readline())
vw = []
vwo = []
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
:mod:`codecs` text streams now split lines only on ``'\r'``, ``'\n'`` and
``'\r\n'``, like :mod:`io` text streams in :term:`universal newlines` mode.
Previously they split lines on all Unicode line terminators, as
:meth:`str.splitlines` does.