Skip to content

Commit e12e88d

Browse files
Check also for surrogates in HTML.
1 parent 22e5543 commit e12e88d

2 files changed

Lines changed: 42 additions & 13 deletions

File tree

Lib/test/test_xml_etree.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1400,6 +1400,10 @@ def check_valid(self, elem, expected):
14001400
def test_invalid_comment(self):
14011401
self.check(ET.Comment('a--b'))
14021402
self.check(ET.Comment(' B+, B, or B-'))
1403+
self.check(ET.Comment('\x00'))
1404+
self.check(ET.Comment('\x01'))
1405+
self.check(ET.Comment('\ud8ff'))
1406+
self.check(ET.Comment('\ufffe'))
14031407

14041408
def test_invalid_processing_instruction(self):
14051409
self.check(ET.PI(''))
@@ -1412,6 +1416,7 @@ def test_invalid_processing_instruction(self):
14121416
self.check(ET.PI('xml', 'encoding="UTF-8"'))
14131417
self.check(ET.PI('foo', 'a?>b'))
14141418
self.check(ET.PI('foo', '\x00'))
1419+
self.check(ET.PI('foo', '\x01'))
14151420
self.check(ET.PI('foo', '\ud8ff'))
14161421
self.check(ET.PI('foo', '\ufffe'))
14171422

@@ -1500,10 +1505,12 @@ def test_invalid_comment(self):
15001505
self.check(ET.Comment('a-->b'))
15011506
self.check(ET.Comment('a--!>b'))
15021507
self.check(ET.Comment('a\x00b'))
1508+
self.check(ET.Comment('a\ud8ffb'))
15031509

15041510
def test_invalid_processing_instruction(self):
15051511
self.check(ET.PI('a>b'))
15061512
self.check(ET.PI('a\x00b'))
1513+
self.check(ET.PI('a\ud8ffb'))
15071514

15081515
def test_invalid_tag(self):
15091516
self.check(ET.Element(''))
@@ -1516,37 +1523,50 @@ def test_invalid_tag(self):
15161523
self.check(ET.Element('a/b'))
15171524
self.check(ET.Element('a>b'))
15181525
self.check(ET.Element('a\x00b'))
1526+
self.check(ET.Element('a\ud8ffb'))
15191527
self.check(ET.Element(ET.QName('')))
15201528
self.check(ET.Element(ET.QName('0')))
15211529
self.check(ET.Element(ET.QName('a/b')))
15221530

15231531
def test_invalid_attr_name(self):
15241532
self.check(ET.Element('tag', attrib={'': 'value'}))
1533+
self.check(ET.Element('tag', attrib={'\x00': 'value'}))
1534+
self.check(ET.Element('tag', attrib={'\ud8ff': 'value'}))
15251535
self.check(ET.Element('tag', attrib={'a/b': 'value'}))
15261536
self.check(ET.Element('tag', attrib={'a=b': 'value'}))
1537+
self.check(ET.Element('tag', attrib={'a\x00b': 'value'}))
1538+
self.check(ET.Element('tag', attrib={'a\ud8ffb': 'value'}))
15271539
self.check(ET.Element('tag', attrib={ET.QName(''): 'value'}))
15281540
self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'}))
15291541

15301542
def test_invalid_attr_value(self):
15311543
self.check(ET.Element('tag', attrib={'key': '\x00'}))
1544+
self.check(ET.Element('tag', attrib={'key': '\ud8ff'}))
15321545
self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')}))
1546+
self.check(ET.Element('tag', attrib={'key': ET.QName('\ud8ff')}))
15331547
self.check(ET.Element('tag', attrib={'key': ET.QName('a"b')}))
15341548
self.check(ET.Element('tag', attrib={'key': ET.QName('a&b')}))
15351549

15361550
def test_invalid_text(self):
15371551
elem = ET.Element('tag')
15381552
elem.text = '\x00'
15391553
self.check(elem)
1554+
elem.text = '\ud8ff'
1555+
self.check(elem)
15401556

15411557
def test_invalid_tail(self):
15421558
elem = ET.Element('tag')
15431559
elem.tail = '\x00'
15441560
self.check(elem)
1561+
elem.tail = '\ud8ff'
1562+
self.check(elem)
15451563

15461564
def test_invalid_text_without_tag(self):
15471565
elem = ET.Element(None)
15481566
elem.text = '\x00'
15491567
self.check(elem)
1568+
elem.text = '\ud8ff'
1569+
self.check(elem)
15501570

15511571
def test_invalid_subelements(self):
15521572
elem = ET.Element('tag')
@@ -1558,7 +1578,9 @@ def test_invalid_subelements(self):
15581578

15591579
def test_invalid_namespace_uri(self):
15601580
self.check(ET.Element('{\x00}tag'))
1581+
self.check(ET.Element('{\ud8ff}tag'))
15611582
self.check(ET.Element(ET.QName('\x00', 'tag')))
1583+
self.check(ET.Element(ET.QName('\ud8ff', 'tag')))
15621584

15631585
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
15641586
def test_invalid_cdata_content(self, tag):
@@ -1571,6 +1593,8 @@ def test_invalid_cdata_content(self, tag):
15711593
self.check(elem)
15721594
elem.text = 'a\x00b'
15731595
self.check(elem)
1596+
elem.text = 'a\ud8ffb'
1597+
self.check(elem)
15741598

15751599
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
15761600
def test_cdata_subelements(self, tag):
@@ -1582,6 +1606,8 @@ def test_invalid_plaintext_content(self):
15821606
elem = ET.Element('plaintext')
15831607
elem.text = 'a\x00b'
15841608
self.check(elem)
1609+
elem.text = 'a\ud8ffb'
1610+
self.check(elem)
15851611

15861612

15871613
class IterparseTest(unittest.TestCase):

Lib/xml/etree/ElementTree.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -878,7 +878,7 @@ def _serialize_xml(write, elem, qnames, namespaces, *,
878878
text = elem.text
879879
if tag is Comment:
880880
if validate:
881-
if '--' in text or text.endswith('-'):
881+
if '--' in text or text.endswith('-') or not is_valid_text(text):
882882
raise ValueError('invalid comment')
883883
write("<!--%s-->" % text)
884884
elif tag is ProcessingInstruction:
@@ -955,33 +955,36 @@ def _serialize_xml(write, elem, qnames, namespaces, *,
955955
"img", "input", "isindex", "link", "meta", "param", "source",
956956
"track", "wbr", "plaintext"}
957957

958+
def _is_valid_html_text(text):
959+
return re.search('[\x00\ud800-\udfff]', text) is None
960+
958961
def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs):
959962
tag = elem.tag
960963
text = elem.text
961964
if tag is Comment:
962965
if validate:
963966
if (re.prefixmatch('-?>', text) or re.search('--!?>', text)
964-
or '\0' in text):
967+
or not _is_valid_html_text(text)):
965968
raise ValueError('invalid comment')
966969
write("<!--%s-->" % text)
967970
elif tag is ProcessingInstruction:
968971
if validate:
969-
if '>' in text or '\0' in text:
972+
if '>' in text or not _is_valid_html_text(text):
970973
raise ValueError(f'invalid processing instruction {text!r}')
971974
write("<?%s?>" % text)
972975
else:
973976
tag = qnames[tag]
974977
if tag is None:
975978
if text:
976979
if validate:
977-
if '\0' in text:
980+
if not _is_valid_html_text(text):
978981
raise ValueError('invalid characters')
979982
write(_escape_cdata(text))
980983
for e in elem:
981984
_serialize_html(write, e, qnames, None, validate=validate)
982985
else:
983986
if validate:
984-
if not re.fullmatch('[A-Za-z][^\0\t\n\r\f />]*+', tag):
987+
if not re.fullmatch('[A-Za-z][^\0\t\n\r\f />\ud800-\udfff]*+', tag):
985988
raise ValueError(f'invalid element name {tag!r}')
986989
write("<" + tag)
987990
items = list(elem.items())
@@ -992,10 +995,10 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs)
992995
if k:
993996
k = ":" + k
994997
if validate:
995-
if not re.fullmatch('[^\0\t\n\r\f />=]++', k):
996-
raise ValueError(f'invalid attribute name {k!r}')
998+
if not re.fullmatch('[^\0\t\n\r\f />=\ud800-\udfff]++', k):
999+
raise ValueError(f'invalid namespace name {k[1:]!r}')
9971000
if validate:
998-
if '\0' in v:
1001+
if not _is_valid_html_text(v):
9991002
raise ValueError('invalid characters')
10001003
write(" xmlns%s=\"%s\"" % (
10011004
k,
@@ -1006,27 +1009,27 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs)
10061009
k = k.text
10071010
k = qnames[k]
10081011
if validate:
1009-
if not re.fullmatch('[^\0\t\n\r\f />][^\0\t\n\r\f />=]*+', k):
1012+
if not re.fullmatch('[^\0\t\n\r\f />\ud800-\udfff][^\0\t\n\r\f />=\ud800-\udfff]*+', k):
10101013
raise ValueError(f'invalid attribute name {k!r}')
10111014
if v is None:
10121015
write(" %s" % k) # empty attr
10131016
else:
10141017
if isinstance(v, QName):
10151018
v = qnames[v.text]
10161019
if validate:
1017-
if '\0' in v or '"' in v or '&' in v:
1020+
if re.search('[\0"&\ud800-\udfff]', v):
10181021
raise ValueError(f'invalid attribute value {v!r}')
10191022
else:
10201023
if validate:
1021-
if '\0' in v:
1024+
if not _is_valid_html_text(v):
10221025
raise ValueError(f'invalid attribute value {v!r}')
10231026
v = _escape_attrib_html(v)
10241027
write(" %s=\"%s\"" % (k, v))
10251028
write(">")
10261029
ltag = tag.lower()
10271030
if text:
10281031
if validate:
1029-
if '\0' in text:
1032+
if not _is_valid_html_text(text):
10301033
raise ValueError('invalid characters')
10311034
if ltag in _CDATA_CONTENT_ELEMENTS:
10321035
if validate:
@@ -1046,7 +1049,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs)
10461049
write("</" + tag + ">")
10471050
if elem.tail:
10481051
if validate:
1049-
if '\0' in elem.tail:
1052+
if not _is_valid_html_text(elem.tail):
10501053
raise ValueError('invalid characters')
10511054
write(_escape_cdata(elem.tail))
10521055

0 commit comments

Comments
 (0)