Skip to content

gh-118350: Fix support of elements "textarea" and "title" in HTMLParser #135310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@ class HTMLParser(_markupbase.ParserBase):
containing respectively the named or numeric reference as the
argument.
"""

# Elements whose content is raw text: after their start tag,
# parse_starttag() calls set_cdata_mode(tag), so only the matching end
# tag is recognized as markup.
CDATA_CONTENT_ELEMENTS = ("script", "style")
# Elements whose content is escapable raw text: parse_starttag() calls
# set_cdata_mode(tag, True) for them, so the content is scanned the same
# way but character references in it are still processed.
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
Expand All @@ -145,6 +145,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._escapable = True
super().reset()

def feed(self, data):
Expand All @@ -166,14 +167,20 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text

def set_cdata_mode(self, elem, escapable=False):
    """Switch the parser into raw-text mode for the content of *elem*.

    While this mode is active the scanner only looks for the end tag
    matching *elem*; other markup-like text is passed through as data.

    elem -- tag name of the element that was just opened; it is stored
        lowercased in self.cdata_elem.
    escapable -- true for escapable raw text elements ("textarea",
        "title"), whose character references are still processed; false
        (the default) for raw text elements ("script", "style").
    """
    self.cdata_elem = elem.lower()
    self._escapable = escapable
    # Only the matching end tag terminates the element's content.
    pattern = r'</%s(?=[\t\n\r\f />])' % self.cdata_elem
    if escapable and not self.convert_charrefs:
        # With convert_charrefs disabled, references are reported
        # separately from the surrounding data, so the scanner must also
        # stop at every '&'.
        pattern = '&|' + pattern
    self.interesting = re.compile(pattern, re.IGNORECASE|re.ASCII)

def clear_cdata_mode(self):
    """Leave raw-text mode and return to ordinary markup scanning."""
    self.cdata_elem = None
    # Outside raw-text elements, data is eligible for unescaping again
    # (goahead() checks this flag together with convert_charrefs).
    self._escapable = True
    self.interesting = interesting_normal

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
Expand Down Expand Up @@ -206,7 +213,7 @@ def goahead(self, end):
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
Expand Down Expand Up @@ -308,7 +315,7 @@ def goahead(self, end):
assert 0, "interesting.search() lied"
# end while
if end and i < n:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
Expand Down Expand Up @@ -420,6 +427,8 @@ def parse_starttag(self, i):
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, True)
return endpos

# Internal -- check to see if we have a complete starttag; return end
Expand Down
96 changes: 96 additions & 0 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,49 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])

@support.subTests('content', [
    '<!-- not a comment -->',
    "<not a='start tag'>",
    '<![CDATA[not a cdata]]>',
    '<!not a bogus comment>',
    '</not a bogus comment>',
    '\u2603',
    '< /title>',
    '</ title>',
    '</titled>',
    '</title\v>',
    '</title\xa0>',
    '</tıtle>',
])
def test_title_content(self, content):
    # Markup-like text and near-miss end tags inside <title> must be
    # reported as data, unchanged, between the start and end tag events.
    expected_events = [
        ("starttag", "title", []),
        ("data", content),
        ("endtag", "title"),
    ]
    self._run_check(f"<title>{content}</title>", expected_events)

@support.subTests('content', [
    '<!-- not a comment -->',
    "<not a='start tag'>",
    '<![CDATA[not a cdata]]>',
    '<!not a bogus comment>',
    '</not a bogus comment>',
    '\u2603',
    '< /textarea>',
    '</ textarea>',
    '</textareable>',
    '</textarea\v>',
    '</textarea\xa0>',
])
def test_textarea_content(self, content):
    # Markup-like text and near-miss end tags inside <textarea> must be
    # reported as data, unchanged, between the start and end tag events.
    expected_events = [
        ("starttag", "textarea", []),
        ("data", content),
        ("endtag", "textarea"),
    ]
    self._run_check(f"<textarea>{content}</textarea>", expected_events)

@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
'script/', 'script foo=bar', 'script foo=">"'])
def test_script_closing_tag(self, endtag):
Expand Down Expand Up @@ -346,6 +389,38 @@ def test_style_closing_tag(self, endtag):
("endtag", "style")],
collector=EventCollectorNoNormalize(convert_charrefs=False))

@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
                             'title/', 'title foo=bar', 'title foo=">"'])
def test_title_closing_tag(self, endtag):
    # Every accepted spelling of the end tag must close <title>; the
    # content stays data, with "&amp;" either unescaped in place or
    # reported as its own entityref event depending on convert_charrefs.
    content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
    source = f'<TitLe>{content}</{endtag}>'
    cases = [
        (True, [('data', '<!-- not a comment --><i>Egg & Spam</i>')]),
        (False, [('data', '<!-- not a comment --><i>Egg '),
                 ('entityref', 'amp'),
                 ('data', ' Spam</i>')]),
    ]
    for convert_charrefs, content_events in cases:
        expected_events = [
            ("starttag", "title", []),
            *content_events,
            ("endtag", "title"),
        ]
        self._run_check(
            source, expected_events,
            collector=EventCollectorNoNormalize(
                convert_charrefs=convert_charrefs))

@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
                             'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
def test_textarea_closing_tag(self, endtag):
    # Every accepted spelling of the end tag must close <textarea>; the
    # content stays data, with "&amp;" either unescaped in place or
    # reported as its own entityref event depending on convert_charrefs.
    content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
    source = f'<TexTarEa>{content}</{endtag}>'
    cases = [
        (True, [('data', '<!-- not a comment --><i>Egg & Spam</i>')]),
        (False, [('data', '<!-- not a comment --><i>Egg '),
                 ('entityref', 'amp'),
                 ('data', ' Spam</i>')]),
    ]
    for convert_charrefs, content_events in cases:
        expected_events = [
            ("starttag", "textarea", []),
            *content_events,
            ("endtag", "textarea"),
        ]
        self._run_check(
            source, expected_events,
            collector=EventCollectorNoNormalize(
                convert_charrefs=convert_charrefs))

@support.subTests('tail,end', [
('', False),
('<', False),
Expand All @@ -363,6 +438,27 @@ def test_eof_in_script(self, tail, end):
("data", content if end else content + tail)],
collector=EventCollectorNoNormalize(convert_charrefs=False))

@support.subTests('tail,end', [
    ('', False),
    ('<', False),
    ('</', False),
    ('</t', False),
    ('</title', False),
    ('</title ', True),
    ('</title foo=bar', True),
    ('</title foo=">', True),
])
def test_eof_in_title(self, tail, end):
    # The input stops inside <title>, followed by `tail`.  When `end` is
    # true the tail is not expected back as data; otherwise it is appended
    # to the last data event.  No endtag event is expected either way.
    source = f'<TitLe>Egg &amp; Spam{tail}'
    trailing = '' if end else tail
    self._run_check(
        source,
        [("starttag", "title", []),
         ("data", "Egg & Spam" + trailing)],
        collector=EventCollectorNoNormalize(convert_charrefs=True))
    self._run_check(
        source,
        [("starttag", "title", []),
         ('data', 'Egg '),
         ('entityref', 'amp'),
         ('data', ' Spam' + trailing)],
        collector=EventCollectorNoNormalize(convert_charrefs=False))

def test_comments(self):
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix support for escapable raw text mode (elements "textarea" and "title")
in :class:`html.parser.HTMLParser`.
Loading