File: test_clean_embed.txt

package info (click to toggle)
lxml-html-clean 0.4.2-1
  • links: PTS
  • area: main
  • in suites: trixie
  • size: 228 kB
  • sloc: python: 865; makefile: 12
file content (39 lines) | stat: -rw-r--r-- 1,520 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
NOTE: This test fails with libxml2 2.6.29 and 2.6.30!


>>> from lxml.html import fromstring, tostring
>>> from lxml_html_clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> def tostring(el):  # work-around for Py3 'bytes' type
...     from lxml.html import tostring
...     s = tostring(el)
...     if not isinstance(s, str):
...         s = s.decode('UTF-8')
...     return s

>>> # Sample markup: two <embed> and two <script> elements; one of each is served from www.youtube.com.
>>> doc_embed = '''<div>
... <embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
... <embed src="http://anothersite.com/v/another"></embed>
... <script src="http://www.youtube.com/example.js"></script>
... <script src="/something-else.js"></script>
... </div>'''
>>> # Parsing and re-serializing without any cleaning leaves the markup unchanged.
>>> print(tostring(fromstring(doc_embed)))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<embed src="http://anothersite.com/v/another"></embed>
<script src="http://www.youtube.com/example.js"></script>
<script src="/something-else.js"></script>
</div>
>>> # A default Cleaner removes every <embed> and <script> element from this document.
>>> print(Cleaner().clean_html(doc_embed))
<div>
</div>
>>> # With host_whitelist, the <embed> from the whitelisted host is kept; both <script> elements are still removed.
>>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
</div>
>>> # Additionally passing whitelist_tags=None also keeps the <script> loaded from the whitelisted host.
>>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed))
<div>
<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
<script src="http://www.youtube.com/example.js"></script>
</div>