Skip to content

Commit 85766db

Browse files
miss-islingtonserhiy-storchakaezio-melotti
authored
[3.10] gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard (GH-135664) (GH-136275)
* "--!>" now ends the comment. * "-- >" no longer ends the comment. * Support abnormally ended empty comments "<!-->" and "<!--->". --------- (cherry picked from commit 8ac7613) Co-author: Kerim Kabirov <the.privat33r+gh@pm.me> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
1 parent fdc9d21 commit 85766db

File tree

3 files changed

+50
-3
lines changed

3 files changed

+50
-3
lines changed

Lib/html/parser.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
starttagopen = re.compile('<[a-zA-Z]')
2828
endtagopen = re.compile('</[a-zA-Z]')
2929
piclose = re.compile('>')
30-
commentclose = re.compile(r'--\s*>')
30+
commentclose = re.compile(r'--!?>')
31+
commentabruptclose = re.compile(r'-?>')
3132
# Note:
3233
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
3334
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
@@ -290,6 +291,21 @@ def parse_html_declaration(self, i):
290291
else:
291292
return self.parse_bogus_comment(i)
292293

294+
# Internal -- parse comment, return length or -1 if not terminated
295+
# see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
296+
def parse_comment(self, i, report=True):
    """Parse the comment that starts with ``<!--`` at index *i* of rawdata.

    Returns the index just past the comment terminator, or -1 when the
    buffered data does not yet contain a terminator.  When *report* is
    true, the comment text (without the delimiters) is passed to
    ``self.handle_comment()``.
    """
    rawdata = self.rawdata
    assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
    # A ">" or "->" directly after "<!--" abruptly closes an empty comment.
    # This must be tested before scanning ahead for "-->" / "--!>";
    # otherwise a terminator belonging to a later comment in the same
    # buffer is picked up instead, and the result depends on how the
    # input happened to be split across feed() calls.
    match = commentabruptclose.match(rawdata, i+4)
    if not match:
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
    if report:
        j = match.start()
        self.handle_comment(rawdata[i+4: j])
    return match.end()
308+
293309
# Internal -- parse bogus comment, return length or -1 if not terminated
294310
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
295311
def parse_bogus_comment(self, i, report=1):

Lib/test/test_htmlparser.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -321,17 +321,45 @@ def test_comments(self):
321321
html = ("<!-- I'm a valid comment -->"
322322
'<!--me too!-->'
323323
'<!------>'
324+
'<!----->'
324325
'<!---->'
326+
# abrupt-closing-of-empty-comment
327+
'<!--->'
328+
'<!-->'
325329
'<!----I have many hyphens---->'
326330
'<!-- I have a > in the middle -->'
327-
'<!-- and I have -- in the middle! -->')
331+
'<!-- and I have -- in the middle! -->'
332+
'<!--incorrectly-closed-comment--!>'
333+
'<!----!>'
334+
'<!----!-->'
335+
'<!---- >-->'
336+
'<!---!>-->'
337+
'<!--!>-->'
338+
# nested-comment
339+
'<!-- <!-- nested --> -->'
340+
'<!--<!-->'
341+
'<!--<!--!>'
342+
)
328343
expected = [('comment', " I'm a valid comment "),
329344
('comment', 'me too!'),
330345
('comment', '--'),
346+
('comment', '-'),
347+
('comment', ''),
348+
('comment', ''),
331349
('comment', ''),
332350
('comment', '--I have many hyphens--'),
333351
('comment', ' I have a > in the middle '),
334-
('comment', ' and I have -- in the middle! ')]
352+
('comment', ' and I have -- in the middle! '),
353+
('comment', 'incorrectly-closed-comment'),
354+
('comment', ''),
355+
('comment', '--!'),
356+
('comment', '-- >'),
357+
('comment', '-!>'),
358+
('comment', '!>'),
359+
('comment', ' <!-- nested '), ('data', ' -->'),
360+
('comment', '<!'),
361+
('comment', '<!'),
362+
]
335363
self._run_check(html, expected)
336364

337365
def test_condcoms(self):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix comment parsing in :class:`html.parser.HTMLParser` according to the
2+
HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the
3+
comment. Support abnormally ended empty comments ``<!-->`` and ``<!--->``.

0 commit comments

Comments
 (0)