# -*- coding: utf-8 -*- """ HTML parser test cases for etree """ import unittest import tempfile, os, os.path, sys this_dir = os.path.dirname(__file__) if this_dir not in sys.path: sys.path.insert(0, this_dir) # needed for Py3 from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str from common_imports import SillyFileLike, HelperTestCase, write_to_file, next try: unicode except NameError: unicode = str class HtmlParserTestCase(HelperTestCase): """HTML parser test cases """ etree = etree html_str = _bytes("
\\U00026007
' ).decode('unicode_escape')) p_text = element.findtext('.//p') self.assertEqual(1, len(p_text)) self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'), p_text) def test_html_ids(self): parser = self.etree.HTMLParser(recover=False) fromstring = self.etree.fromstring html = fromstring(''' ''', parser=parser) self.assertEqual(len(html.xpath('//p[@id="pID"]')), 1) def test_html_ids_no_collect_ids(self): parser = self.etree.HTMLParser(recover=False, collect_ids=False) fromstring = self.etree.fromstring html = fromstring(''' ''', parser=parser) self.assertEqual(len(html.xpath('//p[@id="pID"]')), 1) def test_module_HTML_pretty_print(self): element = self.etree.HTML(self.html_str) self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), self.html_str_pretty) def test_module_parse_html_error(self): parser = self.etree.HTMLParser(recover=False) parse = self.etree.parse f = BytesIO("") self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser) def test_html_element_name_empty(self): parser = self.etree.HTMLParser() Element = parser.makeelement el = Element('name') self.assertRaises(ValueError, Element, '{}') self.assertRaises(ValueError, setattr, el, 'tag', '{}') self.assertRaises(ValueError, Element, '{test}') self.assertRaises(ValueError, setattr, el, 'tag', '{test}') def test_html_element_name_colon(self): parser = self.etree.HTMLParser() Element = parser.makeelement pname = Element('p:name') self.assertEqual(pname.tag, 'p:name') pname = Element('{test}p:name') self.assertEqual(pname.tag, '{test}p:name') pname = Element('name') pname.tag = 'p:name' self.assertEqual(pname.tag, 'p:name') def test_html_element_name_quote(self): parser = self.etree.HTMLParser() Element = parser.makeelement self.assertRaises(ValueError, Element, 'p"name') self.assertRaises(ValueError, Element, "na'me") self.assertRaises(ValueError, Element, '{test}"name') self.assertRaises(ValueError, Element, "{test}name'") el = Element('name') self.assertRaises(ValueError, setattr, el, 'tag', "pname'") self.assertRaises(ValueError, setattr, el, 'tag', '"pname') self.assertEqual(el.tag, "name") def test_html_element_name_space(self): parser = self.etree.HTMLParser() Element = parser.makeelement self.assertRaises(ValueError, Element, ' name ') self.assertRaises(ValueError, Element, 'na me') self.assertRaises(ValueError, Element, '{test} name') el = Element('name') self.assertRaises(ValueError, setattr, el, 'tag', ' name ') self.assertEqual(el.tag, "name") def test_html_subelement_name_empty(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') self.assertRaises(ValueError, SubElement, el, '{}') self.assertRaises(ValueError, SubElement, el, '{test}') def test_html_subelement_name_colon(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') pname = SubElement(el, 'p:name') self.assertEqual(pname.tag, 'p:name') pname = SubElement(el, '{test}p:name') self.assertEqual(pname.tag, '{test}p:name') def test_html_subelement_name_quote(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') self.assertRaises(ValueError, SubElement, el, "name'") self.assertRaises(ValueError, SubElement, el, 'na"me') self.assertRaises(ValueError, SubElement, el, "{test}na'me") self.assertRaises(ValueError, SubElement, el, '{test}"name') def test_html_subelement_name_space(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') self.assertRaises(ValueError, SubElement, el, ' name ') self.assertRaises(ValueError, SubElement, el, 'na me') self.assertRaises(ValueError, SubElement, el, '{test} name') def test_module_parse_html_norecover(self): parser = self.etree.HTMLParser(recover=False) parse = self.etree.parse f = BytesIO(self.broken_html_str) self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser) def test_module_parse_html_default_doctype(self): parser = self.etree.HTMLParser(default_doctype=False) d = html.fromstring('%s
') % text).encode('iso-8859-1') tree = self.etree.parse( BytesIO(html_latin1), self.etree.HTMLParser(encoding="iso-8859-1")) p = tree.find("//p") self.assertEqual(p.text, text) def test_parse_encoding_8bit_override(self): text = _str('Søk på nettet') wrong_head = _str(''' ''') html_latin1 = (_str('%s%s
') % (wrong_head, text) ).encode('iso-8859-1') self.assertRaises(self.etree.ParseError, self.etree.parse, BytesIO(html_latin1)) tree = self.etree.parse( BytesIO(html_latin1), self.etree.HTMLParser(encoding="iso-8859-1")) p = tree.find("//p") self.assertEqual(p.text, text) def test_module_HTML_broken(self): element = self.etree.HTML(self.broken_html_str) self.assertEqual(self.etree.tostring(element, method="html"), self.html_str) def test_module_HTML_cdata(self): # by default, libxml2 generates CDATA nodes for