1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12       
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
# Python 2/3 compatibility shims: each block probes for a Python 2 builtin
# name and, when the lookup raises NameError, binds that name to the closest
# equivalent available on the running interpreter.
try:
    unichr
except NameError:
    # No ``unichr`` builtin: fall back to ``chr``.
    unichr = chr
try:
    unicode
except NameError:
    # No ``unicode`` type: fall back to ``str``.
    unicode = str
try:
    bytes
except NameError:
    # No ``bytes`` type: fall back to ``str``.
    bytes = str
try:
    basestring
except NameError:
    # No ``basestring``: use a tuple, which is accepted by isinstance().
    basestring = (str, bytes)
 38   
 39   
# Public API: the names exported by ``from <this module> import *``.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
 42   
 43   
 44   
 45   
 46   
 47   
 48   
 49   
 50   
 51   
 52   
 53   
 54   
 55   
 56   
 57   
 58   
 59   
 60   
 61   
 62   
 63   
# Matches a CSS ``expression(...)`` call (non-greedy up to the first ")"),
# case-insensitively and across line breaks.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)


# Matches the start of a CSS ``@import`` rule; whitespace is allowed between
# "@" and "import".
_css_import_re = re.compile(
    r'@\s*import', re.I)



# Bound ``search``: truthy when the string starts with
# ``data:image/<something>;base64`` (case-insensitive).
_is_image_dataurl = re.compile(
    r'^data:image/.+;base64', re.I).search
# Bound ``search``: truthy when one of the listed scheme names followed by
# ":" occurs anywhere in the string (the pattern is not anchored).
_is_possibly_malicious_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search

# Bound ``sub``: replaces each run of whitespace and/or the control
# characters in the listed ranges.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub



# Matches the ``[if ...]>`` portion of a conditional comment, spanning lines.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# XPath: the context node and every descendant carrying a ``style`` attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# XPath: ``<a>`` elements (plain or in the XHTML namespace) whose
# whitespace-normalized ``href`` is non-empty and does not start with "#".
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
 97   
 99      """ 
100      Instances clean the document of each of the possible offending 
101      elements.  The cleaning is controlled by attributes; you can 
102      override attributes in a subclass, or set them in the constructor. 
103   
104      ``scripts``: 
105          Removes any ``<script>`` tags. 
106   
107      ``javascript``: 
108          Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets 
109          as they could contain Javascript. 
110   
111      ``comments``: 
112          Removes any comments. 
113   
114      ``style``: 
115          Removes any style tags or attributes. 
116   
117      ``links``: 
118          Removes any ``<link>`` tags 
119   
120      ``meta``: 
121          Removes any ``<meta>`` tags 
122   
123      ``page_structure``: 
124          Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. 
125   
126      ``processing_instructions``: 
127          Removes any processing instructions. 
128   
129      ``embedded``: 
130          Removes any embedded objects (flash, iframes) 
131   
132      ``frames``: 
133          Removes any frame-related tags 
134   
135      ``forms``: 
136          Removes any form tags 
137   
138      ``annoying_tags``: 
139          Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>`` 
140   
141      ``remove_tags``: 
142          A list of tags to remove.  Only the tags will be removed, 
143          their content will get pulled up into the parent tag. 
144   
145      ``kill_tags``: 
146          A list of tags to kill.  Killing also removes the tag's content, 
147          i.e. the whole subtree, not just the tag itself. 
148   
149      ``allow_tags``: 
150          A list of tags to include (default include all). 
151   
152      ``remove_unknown_tags``: 
153          Remove any tags that aren't standard parts of HTML. 
154   
155      ``safe_attrs_only``: 
156          If true, only include 'safe' attributes (specifically the list 
157          from the feedparser HTML sanitisation web site). 
158   
159      ``safe_attrs``: 
160          A set of attribute names to override the default list of attributes 
161          considered 'safe' (when safe_attrs_only=True). 
162   
163      ``add_nofollow``: 
164          If true, then any <a> tags will have ``rel="nofollow"`` added to them. 
165   
166      ``host_whitelist``: 
167          A list or set of hosts that you can use for embedded content 
168          (for content like ``<object>``, ``<link rel="stylesheet">``, etc). 
169          You can also implement/override the method 
170          ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to 
171          implement more complex rules for what can be embedded. 
172          Anything that passes this test will be shown, regardless of 
173          the value of (for instance) ``embedded``. 
174   
175          Note that this parameter might not work as intended if you do not 
176          make the links absolute before doing the cleaning. 
177   
178          Note that you may also need to set ``whitelist_tags``. 
179   
180      ``whitelist_tags``: 
181          A set of tags that can be included with ``host_whitelist``. 
182          The default is ``iframe`` and ``embed``; you may wish to 
183          include other tags like ``script``, or you may want to 
184          implement ``allow_embedded_url`` for more control.  Set to None to 
185          include all tags. 
186   
187      This modifies the document *in place*. 
188      """ 
189   
190      scripts = True 
191      javascript = True 
192      comments = True 
193      style = False 
194      links = True 
195      meta = True 
196      page_structure = True 
197      processing_instructions = True 
198      embedded = True 
199      frames = True 
200      forms = True 
201      annoying_tags = True 
202      remove_tags = None 
203      allow_tags = None 
204      kill_tags = None 
205      remove_unknown_tags = True 
206      safe_attrs_only = True 
207      safe_attrs = defs.safe_attrs 
208      add_nofollow = False 
209      host_whitelist = () 
210      whitelist_tags = set(['iframe', 'embed']) 
211   
218   
219       
220       
221      _tag_link_attrs = dict( 
222          script='src', 
223          link='href', 
224           
225           
226          applet=['code', 'object'], 
227          iframe='src', 
228          embed='src', 
229          layer='src', 
230           
231           
232           
233           
234           
235           
236           
237           
238          a='href', 
239          ) 
240   
242          """ 
243          Cleans the document. 
244          """ 
245          if hasattr(doc, 'getroot'): 
246               
247              doc = doc.getroot() 
248           
249          xhtml_to_html(doc) 
250           
251           
252          for el in doc.iter('image'): 
253              el.tag = 'img' 
254          if not self.comments: 
255               
256               
257              self.kill_conditional_comments(doc) 
258   
259          kill_tags = set(self.kill_tags or ()) 
260          remove_tags = set(self.remove_tags or ()) 
261          allow_tags = set(self.allow_tags or ()) 
262   
263          if self.scripts: 
264              kill_tags.add('script') 
265          if self.safe_attrs_only: 
266              safe_attrs = set(self.safe_attrs) 
267              for el in doc.iter(etree.Element): 
268                  attrib = el.attrib 
269                  for aname in attrib.keys(): 
270                      if aname not in safe_attrs: 
271                          del attrib[aname] 
272          if self.javascript: 
273              if not (self.safe_attrs_only and 
274                      self.safe_attrs == defs.safe_attrs): 
275                   
276                  for el in doc.iter(etree.Element): 
277                      attrib = el.attrib 
278                      for aname in attrib.keys(): 
279                          if aname.startswith('on'): 
280                              del attrib[aname] 
281              doc.rewrite_links(self._remove_javascript_link, 
282                                resolve_base_href=False) 
283              if not self.style: 
284                   
285                   
286                  for el in _find_styled_elements(doc): 
287                      old = el.get('style') 
288                      new = _css_javascript_re.sub('', old) 
289                      new = _css_import_re.sub('', new) 
290                      if self._has_sneaky_javascript(new): 
291                           
292                          del el.attrib['style'] 
293                      elif new != old: 
294                          el.set('style', new) 
295                  for el in list(doc.iter('style')): 
296                      if el.get('type', '').lower().strip() == 'text/javascript': 
297                          el.drop_tree() 
298                          continue 
299                      old = el.text or '' 
300                      new = _css_javascript_re.sub('', old) 
301                       
302                      new = _css_import_re.sub('', old) 
303                      if self._has_sneaky_javascript(new): 
304                           
305                          el.text = '/* deleted */' 
306                      elif new != old: 
307                          el.text = new 
308          if self.comments or self.processing_instructions: 
309               
310               
311               
312              kill_tags.add(etree.Comment) 
313          if self.processing_instructions: 
314              kill_tags.add(etree.ProcessingInstruction) 
315          if self.style: 
316              kill_tags.add('style') 
317              etree.strip_attributes(doc, 'style') 
318          if self.links: 
319              kill_tags.add('link') 
320          elif self.style or self.javascript: 
321               
322               
323              for el in list(doc.iter('link')): 
324                  if 'stylesheet' in el.get('rel', '').lower(): 
325                       
326                      if not self.allow_element(el): 
327                          el.drop_tree() 
328          if self.meta: 
329              kill_tags.add('meta') 
330          if self.page_structure: 
331              remove_tags.update(('head', 'html', 'title')) 
332          if self.embedded: 
333               
334               
335               
336              for el in list(doc.iter('param')): 
337                  found_parent = False 
338                  parent = el.getparent() 
339                  while parent is not None and parent.tag not in ('applet', 'object'): 
340                      parent = parent.getparent() 
341                  if parent is None: 
342                      el.drop_tree() 
343              kill_tags.update(('applet',)) 
344               
345              remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 
346          if self.frames: 
347               
348               
349               
350              kill_tags.update(defs.frame_tags) 
351          if self.forms: 
352              remove_tags.add('form') 
353              kill_tags.update(('button', 'input', 'select', 'textarea')) 
354          if self.annoying_tags: 
355              remove_tags.update(('blink', 'marquee')) 
356   
357          _remove = [] 
358          _kill = [] 
359          for el in doc.iter(): 
360              if el.tag in kill_tags: 
361                  if self.allow_element(el): 
362                      continue 
363                  _kill.append(el) 
364              elif el.tag in remove_tags: 
365                  if self.allow_element(el): 
366                      continue 
367                  _remove.append(el) 
368   
369          if _remove and _remove[0] == doc: 
370               
371               
372              el = _remove.pop(0) 
373              el.tag = 'div' 
374              el.attrib.clear() 
375          elif _kill and _kill[0] == doc: 
376               
377               
378              el = _kill.pop(0) 
379              if el.tag != 'html': 
380                  el.tag = 'div' 
381              el.clear() 
382   
383          _kill.reverse()  
384          for el in _kill: 
385              el.drop_tree() 
386          for el in _remove: 
387              el.drop_tag() 
388   
389          if self.remove_unknown_tags: 
390              if allow_tags: 
391                  raise ValueError( 
392                      "It does not make sense to pass in both allow_tags and remove_unknown_tags") 
393              allow_tags = set(defs.tags) 
394          if allow_tags: 
395              bad = [] 
396              for el in doc.iter(): 
397                  if el.tag not in allow_tags: 
398                      bad.append(el) 
399              if bad: 
400                  if bad[0] is doc: 
401                      el = bad.pop(0) 
402                      el.tag = 'div' 
403                      el.attrib.clear() 
404                  for el in bad: 
405                      el.drop_tag() 
406          if self.add_nofollow: 
407              for el in _find_external_links(doc): 
408                  if not self.allow_follow(el): 
409                      rel = el.get('rel') 
410                      if rel: 
411                          if ('nofollow' in rel 
412                                  and ' nofollow ' in (' %s ' % rel)): 
413                              continue 
414                          rel = '%s nofollow' % rel 
415                      else: 
416                          rel = 'nofollow' 
417                      el.set('rel', rel) 
 418   
420          """ 
421          Override to suppress rel="nofollow" on some anchors. 
422          """ 
423          return False 
 424   
426          if el.tag not in self._tag_link_attrs: 
427              return False 
428          attr = self._tag_link_attrs[el.tag] 
429          if isinstance(attr, (list, tuple)): 
430              for one_attr in attr: 
431                  url = el.get(one_attr) 
432                  if not url: 
433                      return False 
434                  if not self.allow_embedded_url(el, url): 
435                      return False 
436              return True 
437          else: 
438              url = el.get(attr) 
439              if not url: 
440                  return False 
441              return self.allow_embedded_url(el, url) 
 442   
444          if (self.whitelist_tags is not None 
445              and el.tag not in self.whitelist_tags): 
446              return False 
447          scheme, netloc, path, query, fragment = urlsplit(url) 
448          netloc = netloc.lower().split(':', 1)[0] 
449          if scheme not in ('http', 'https'): 
450              return False 
451          if netloc in self.host_whitelist: 
452              return True 
453          return False 
 454   
465   
467          bad = [] 
468          for el in doc.iter(iterate): 
469              if condition(el): 
470                  bad.append(el) 
471          for el in bad: 
472              el.drop_tree() 
 473   
481   
482      _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub 
483   
485          """ 
486          Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 
487          can get interpreted, or ``expre/* stuff */ssion(...)``.  This 
488          checks for attempts to do stuff like this. 
489   
490          Typically the response will be to kill the entire style; if you 
491          have just a bit of Javascript in the style another rule will catch 
492          that and remove only the Javascript from the style; this catches 
493          more sneaky attempts. 
494          """ 
495          style = self._substitute_comments('', style) 
496          style = style.replace('\\', '') 
497          style = _substitute_whitespace('', style) 
498          style = style.lower() 
499          if 'javascript:' in style: 
500              return True 
501          if 'expression(' in style: 
502              return True 
503          return False 
 504   
 513   
# Module-level Cleaner instance created with no constructor arguments;
# exported as ``clean``.
clean = Cleaner()
# Bound ``clean_html`` method of that instance, exported as a plain function.
clean_html = clean.clean_html
516   
517   
518   
519   
520   
# Patterns that recognise linkable text.  Each one exposes two named groups:
# ``host`` (checked against the avoid-host patterns) and ``body`` (used as
# the visible link text).
# NOTE(review): presumably the default for autolink()'s ``link_regexes``
# argument -- confirm against its signature.
_link_regexes = [
    # http:// and https:// URLs, with an optional path and an optional
    # trailing parenthesised segment.
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # mailto: links; ``body`` is the address without the "mailto:" prefix.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# Tag names listed for exclusion from autolinking.
# NOTE(review): presumably the default ``avoid_elements`` -- confirm.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns that should not be linked: hosts starting with "localhost",
# hosts ending in example.com/.org/.net, and exactly 127.0.0.1.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# CSS class names listed for exclusion from autolinking.
# NOTE(review): presumably the default ``avoid_classes`` -- confirm.
_avoid_classes = ['nolink']
536   
541      """ 
542      Turn any URLs into links. 
543   
544      It will search for links identified by the given regular 
545      expressions (by default mailto and http(s) links). 
546   
547      It won't link text in an element in avoid_elements, or an element 
548      with a class in avoid_classes.  It won't link to anything with a 
549      host that matches one of the regular expressions in avoid_hosts 
550      (default localhost and 127.0.0.1). 
551   
552      If you pass in an element, the element's tail will not be 
553      substituted, only the contents of the element. 
554      """ 
555      if el.tag in avoid_elements: 
556          return 
557      class_name = el.get('class') 
558      if class_name: 
559          class_name = class_name.split() 
560          for match_class in avoid_classes: 
561              if match_class in class_name: 
562                  return 
563      for child in list(el): 
564          autolink(child, link_regexes=link_regexes, 
565                   avoid_elements=avoid_elements, 
566                   avoid_hosts=avoid_hosts, 
567                   avoid_classes=avoid_classes) 
568          if child.tail: 
569              text, tail_children = _link_text( 
570                  child.tail, link_regexes, avoid_hosts, factory=el.makeelement) 
571              if tail_children: 
572                  child.tail = text 
573                  index = el.index(child) 
574                  el[index+1:index+1] = tail_children 
575      if el.text: 
576          text, pre_children = _link_text( 
577              el.text, link_regexes, avoid_hosts, factory=el.makeelement) 
578          if pre_children: 
579              el.text = text 
580              el[:0] = pre_children 
 581   
def _first_allowed_match(regex, text, avoid_hosts):
    """Return the first match of *regex* in *text* whose host is not avoided.

    A match is rejected when its ``host`` named group matches any pattern in
    *avoid_hosts*; the search then resumes after the rejected match.  Patterns
    without a ``host`` group (or whose ``host`` group did not participate) are
    never host-filtered.  Returns ``None`` when nothing acceptable is found.
    """
    pos = 0
    while True:
        match = regex.search(text, pos)
        if match is None:
            return None
        host = match.groupdict().get('host')
        if host is None or not any(
                host_regex.search(host) for host_regex in avoid_hosts):
            return match
        # Avoided host: keep looking after this match.
        pos = match.end()


def _link_text(text, link_regexes, avoid_hosts, factory):
    """Split *text* into leading plain text plus a list of anchor elements.

    :param text: the string to scan for links.
    :param link_regexes: compiled patterns to search with.  A ``host`` named
        group, if present, is checked against *avoid_hosts*; a ``body`` named
        group, if present and non-empty, becomes the anchor text (otherwise
        the whole match is used).
    :param avoid_hosts: compiled patterns; a match whose host matches one of
        them is skipped.
    :param factory: callable invoked as ``factory('a')`` to create each
        anchor element.
    :returns: ``(leading_text, links)``.  ``leading_text`` is the text before
        the first link (all of *text* when no link is found).  Each element
        in ``links`` has ``href`` and ``text`` set, and its ``tail`` holds
        the text that follows it up to the next link.
    """
    leading_text = ''
    links = []
    while True:
        # Pick the earliest acceptable match over all patterns; on a tie the
        # pattern listed first wins.
        best_match = None
        for regex in link_regexes:
            match = _first_allowed_match(regex, text, avoid_hosts)
            if match is None:
                continue
            if best_match is None or match.start() < best_match.start():
                best_match = match
        if best_match is None:
            # No more links: whatever is left trails the last anchor, or is
            # the entire leading text when no anchor was produced.
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith(('.', ',')):
            # A single trailing "." or "," is treated as sentence punctuation
            # and left in the surrounding text.
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        # groupdict() tolerates patterns that define no ``body`` group.
        body = best_match.groupdict().get('body') or link
        if body.endswith(('.', ',')):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        # Continue scanning after the consumed link.
        text = text[end:]
    return leading_text, links
 639                   
648   
# Give autolink_html the same docstring as autolink.
autolink_html.__doc__ = autolink.__doc__
650   
651   
652   
653   
654   
# Tag names whose text is left untouched by word breaking.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# Class names that opt out of word breaking.
# NOTE(review): presumably the default for word_break()'s ``avoid_classes``
# argument -- confirm against its signature.
_avoid_word_break_classes = ['nobreak']
657   
662      """ 
663      Breaks any long words found in the body of the text (not attributes). 
664   
665      Doesn't affect any of the tags in avoid_elements, by default 
666      ``<textarea>`` and ``<pre>`` 
667   
668      Breaks words by inserting &#8203; (U+200B), which is a unicode character 
669      for Zero Width Space character.  This generally takes up no space 
670      in rendering, but does copy as a space, and in monospace contexts 
671      usually takes up space. 
672   
673      See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion 
674      """ 
675       
676       
677      if el.tag in _avoid_word_break_elements: 
678          return 
679      class_name = el.get('class') 
680      if class_name: 
681          dont_break = False 
682          class_name = class_name.split() 
683          for avoid in avoid_classes: 
684              if avoid in class_name: 
685                  dont_break = True 
686                  break 
687          if dont_break: 
688              return 
689      if el.text: 
690          el.text = _break_text(el.text, max_width, break_character) 
691      for child in el: 
692          word_break(child, max_width=max_width, 
693                     avoid_elements=avoid_elements, 
694                     avoid_classes=avoid_classes, 
695                     break_character=break_character) 
696          if child.tail: 
697              child.tail = _break_text(child.tail, max_width, break_character) 
 698   
704   
def _break_text(text, max_width, break_character):
    """Return *text* with every over-long word broken up.

    A word is a maximal run of non-whitespace characters.  Each word longer
    than *max_width* is replaced by the result of
    ``_insert_break(word, max_width, break_character)``; shorter words and
    all whitespace are kept exactly as they are.

    Each word is rewritten in place, at its own position.  A global
    ``text.replace(word, ...)`` would also rewrite the same characters where
    they occur inside other words, which could leave a longer word only
    partially broken.
    """
    def _fix_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    # re.U keeps ``\S`` aligned with unicode-aware ``str.split()`` on
    # interpreters where it is not already the default for text patterns.
    return re.compile(r'\S+', re.U).sub(_fix_word, text)
 712   
# Matches any single character other than a letter a-z (case-insensitive).
_break_prefer_re = re.compile(r'[^a-z]', re.I)
714   
716      orig_word = word 
717      result = '' 
718      while len(word) > width: 
719          start = word[:width] 
720          breaks = list(_break_prefer_re.finditer(start)) 
721          if breaks: 
722              last_break = breaks[-1] 
723               
724              if last_break.end() > width-10: 
725                   
726                   
727                  start = word[:last_break.end()] 
728          result += start + break_character 
729          word = word[len(start):] 
730      result += word 
731      return result 
 732