1   
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir)  
 13   
 14  from common_imports import etree, html, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 16   
 17  try: 
 18      unicode 
 19  except NameError: 
 20      unicode = str 
 21   
 22   
 24      """HTML parser test cases 
 25      """ 
 26      etree = etree 
 27   
 28      html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>") 
 29      html_str_pretty = _bytes("""\ 
 30  <html> 
 31  <head><title>test</title></head> 
 32  <body><h1>page title</h1></body> 
 33  </html> 
 34  """) 
 35      broken_html_str = _bytes("<html><head><title>test" 
 36                               "<body><h1>page title</h3></p></html>") 
 37      uhtml_str = _bytes( 
 38          "<html><head><title>test á</title></head>" 
 39          "<body><h1>page á title</h1></body></html>").decode('utf8') 
 40   
 44   
 49   
 57   
 59          if sys.maxunicode < 1114111: 
 60              return   
 61          element = self.etree.HTML(_bytes( 
 62              '<html><body><p>\\U00026007</p></body></html>' 
 63          ).decode('unicode_escape')) 
 64          p_text = element.findtext('.//p') 
 65          self.assertEqual(1, len(p_text)) 
 66          self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'), 
 67                           p_text) 
  68   
 73   
 80   
 82          parser = self.etree.HTMLParser() 
 83          Element = parser.makeelement 
 84   
 85          el = Element('name') 
 86          self.assertRaises(ValueError, Element, '{}') 
 87          self.assertRaises(ValueError, setattr, el, 'tag', '{}') 
 88   
 89          self.assertRaises(ValueError, Element, '{test}') 
 90          self.assertRaises(ValueError, setattr, el, 'tag', '{test}') 
  91   
105   
107          parser = self.etree.HTMLParser() 
108          Element = parser.makeelement 
109   
110          self.assertRaises(ValueError, Element, 'p"name') 
111          self.assertRaises(ValueError, Element, "na'me") 
112          self.assertRaises(ValueError, Element, '{test}"name') 
113          self.assertRaises(ValueError, Element, "{test}name'") 
114   
115          el = Element('name') 
116          self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 
117          self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 
118          self.assertEqual(el.tag, "name") 
 119   
121          parser = self.etree.HTMLParser() 
122          Element = parser.makeelement 
123   
124          self.assertRaises(ValueError, Element, ' name ') 
125          self.assertRaises(ValueError, Element, 'na me') 
126          self.assertRaises(ValueError, Element, '{test} name') 
127   
128          el = Element('name') 
129          self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 
130          self.assertEqual(el.tag, "name") 
 131   
141   
153   
155          parser = self.etree.HTMLParser() 
156          Element = parser.makeelement 
157          SubElement = self.etree.SubElement 
158   
159          el = Element('name') 
160          self.assertRaises(ValueError, SubElement, el, "name'") 
161          self.assertRaises(ValueError, SubElement, el, 'na"me') 
162          self.assertRaises(ValueError, SubElement, el, "{test}na'me") 
163          self.assertRaises(ValueError, SubElement, el, '{test}"name') 
 164   
174   
181   
191   
193          text = _str('Søk på nettet') 
194          wrong_head = _str(''' 
195          <head> 
196            <meta http-equiv="Content-Type" 
197                  content="text/html; charset=UTF-8" /> 
198          </head>''') 
199          html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 
200                                                                          text) 
201                        ).encode('iso-8859-1') 
202   
203          self.assertRaises(self.etree.ParseError, 
204                            self.etree.parse, 
205                            BytesIO(html_latin1)) 
206   
207          tree = self.etree.parse( 
208              BytesIO(html_latin1), 
209              self.etree.HTMLParser(encoding="iso-8859-1")) 
210          p = tree.find("//p") 
211          self.assertEqual(p.text, text) 
 212   
217   
223   
227   
240   
248   
249   
250   
251   
252   
253   
254   
255   
262   
277   
279          iterparse = self.etree.iterparse 
280          f = BytesIO( 
281              '<html><head><title>TITLE</title><body><p>P</p></body></html>') 
282   
283          iterator = iterparse(f, html=True) 
284          self.assertEqual(None, iterator.root) 
285   
286          events = list(iterator) 
287          root = iterator.root 
288          self.assertTrue(root is not None) 
289          self.assertEqual( 
290              [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]), 
291               ('end', root[1]), ('end', root)], 
292              events) 
 293   
295          iterparse = self.etree.iterparse 
296          f = BytesIO( 
297              '<html><head><title>TITLE</title><body><p>P</p></body></html>') 
298   
299          iterator = iterparse(f, html=True) 
300          self.assertEqual(None, iterator.root) 
301   
302          event, element = next(iterator) 
303          self.assertEqual('end', event) 
304          self.assertEqual('title', element.tag) 
305          self.assertEqual(None, iterator.root) 
306          del element 
307   
308          event, element = next(iterator) 
309          self.assertEqual('end', event) 
310          self.assertEqual('head', element.tag) 
311          self.assertEqual(None, iterator.root) 
312          del element 
313          del iterator 
 314   
316          iterparse = self.etree.iterparse 
317          f = BytesIO('<head><title>TEST></head><p>P<br></div>') 
318   
319          iterator = iterparse(f, html=True) 
320          self.assertEqual(None, iterator.root) 
321   
322          events = list(iterator) 
323          root = iterator.root 
324          self.assertTrue(root is not None) 
325          self.assertEqual('html', root.tag) 
326          self.assertEqual('head', root[0].tag) 
327          self.assertEqual('body', root[1].tag) 
328          self.assertEqual('p', root[1][0].tag) 
329          self.assertEqual('br', root[1][0][0].tag) 
330          self.assertEqual( 
331              [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 
332               ('end', root[1][0]), ('end', root[1]), ('end', root)], 
333              events) 
 334   
340   
353   
355          iterparse = self.etree.iterparse 
356          f = BytesIO( 
357              '<html><head><title>TITLE</title><body><p>P</p></body></html>') 
358   
359          iterator = iterparse(f, html=True, events=('start',)) 
360          self.assertEqual(None, iterator.root) 
361   
362          events = list(iterator) 
363          root = iterator.root 
364          self.assertNotEqual(None, root) 
365          self.assertEqual( 
366              [('start', root), ('start', root[0]), ('start', root[0][0]), 
367                  ('start', root[1]), ('start', root[1][0])], 
368              events) 
 369   
380   
399   
414   
422              def end(self, tag): 
423                  events.append(("end", tag)) 
 424              def close(self): 
425                  return "DONE" 
426   
427          parser = self.etree.HTMLParser(target=Target()) 
428   
429          parser.feed("<html><body></body></html>") 
430          done = parser.close() 
431   
432          self.assertEqual("DONE", done) 
433          self.assertEqual([ 
434              ("start", "html"), ("start", "body"), 
435              ("end", "body"), ("end", "html")], events) 
436   
444              def end(self, tag): 
445                  events.append(("end", tag)) 
446              def doctype(self, *args): 
447                  events.append(("doctype", args)) 
448              def close(self): 
449                  return "DONE" 
450   
451          parser = self.etree.HTMLParser(target=Target()) 
452          parser.feed("<!DOCTYPE><html><body></body></html>") 
453          done = parser.close() 
454   
455          self.assertEqual("DONE", done) 
456          self.assertEqual([ 
457              ("doctype", (None, None, None)), 
458              ("start", "html"), ("start", "body"), 
459              ("end", "body"), ("end", "html")], events) 
460   
468              def end(self, tag): 
469                  events.append(("end", tag)) 
470              def doctype(self, *args): 
471                  events.append(("doctype", args)) 
472              def close(self): 
473                  return "DONE" 
474   
475          parser = self.etree.HTMLParser(target=Target()) 
476          parser.feed("<!DOCTYPE html><html><body></body></html>") 
477          done = parser.close() 
478   
479          self.assertEqual("DONE", done) 
480          self.assertEqual([ 
481              ("doctype", ("html", None, None)), 
482              ("start", "html"), ("start", "body"), 
483              ("end", "body"), ("end", "html")], events) 
484   
492              def end(self, tag): 
493                  events.append(("end", tag)) 
494              def doctype(self, *args): 
495                  events.append(("doctype", args)) 
496              def close(self): 
497                  return "DONE" 
498   
499          parser = self.etree.HTMLParser(target=Target()) 
500          parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">' 
501                      '<html><body></body></html>') 
502          done = parser.close() 
503   
504          self.assertEqual("DONE", done) 
505          self.assertEqual([ 
506              ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")), 
507              ("start", "html"), ("start", "body"), 
508              ("end", "body"), ("end", "html")], events) 
509   
511          events = [] 
512          class Target(object): 
513              def start(self, tag, attrib): 
514                  events.append(("start", tag)) 
515                  raise ValueError("START") 
 516              def end(self, tag): 
517                  events.append(("end", tag)) 
518                  raise TypeError("END") 
519              def close(self): 
520                  return "DONE" 
521   
522          parser = self.etree.HTMLParser(target=Target()) 
523          try: 
524              parser.feed('<html><body>') 
525              parser.feed('</body></html>') 
526          except ValueError as exc: 
527              assert "START" in str(exc) 
528          except TypeError as exc: 
529              assert "END" in str(exc) 
530              self.assertTrue(False, "wrong exception raised") 
531          else: 
532              self.assertTrue(False, "no exception raised") 
533   
534          self.assertTrue(("start", "html") in events, events) 
535          self.assertTrue(("end", "html") not in events, events) 
536   
538          events = [] 
539          class Target(object): 
540              def start(self, tag, attrib): 
541                  events.append(("start", tag)) 
542                  raise ValueError("START") 
 543              def end(self, tag): 
544                  events.append(("end", tag)) 
545                  raise TypeError("END") 
546              def close(self): 
547                  return "DONE" 
548   
549          parser = self.etree.HTMLParser(target=Target()) 
550          try: 
551              self.etree.fromstring('<html><body></body></html>', parser) 
552          except ValueError as exc: 
553              assert "START" in str(exc), str(exc) 
554          except TypeError as exc: 
555              assert "END" in str(exc), str(exc) 
556              self.assertTrue(False, "wrong exception raised") 
557          else: 
558              self.assertTrue(False, "no exception raised") 
559   
560          self.assertTrue(("start", "html") in events, events) 
561          self.assertTrue(("end", "html") not in events, events) 
562   
564          doc = html.Element('html').getroottree() 
565          doc.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN" 
566          doc.docinfo.system_url = \ 
567              "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" 
568          self.assertEqual(doc.docinfo.doctype, 
569                           '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">') 
570          self.assertEqual(self.etree.tostring(doc), 
571                           _bytes('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> 
572  <html xmlns="http://www.w3.org/1999/xhtml"></html>''')) 
 573   
584   
594   
595   
597      suite = unittest.TestSuite() 
598      suite.addTests([unittest.makeSuite(HtmlParserTestCase)]) 
599      return suite 
 600   
601   
if __name__ == '__main__':
    # Running this file directly does not execute any tests; it only prints
    # a hint that points the user at test.py, with this file's path filled in.
    print('to test use test.py %s' % (__file__,))
604