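文档: https://lxml.de/lxmlhtml.html#cleaning-up-html
注意: 从 lxml 5.2 开始, lxml.html.clean 已被拆分到独立的 lxml_html_clean 包中, 需要额外安装 (pip install lxml_html_clean) 之后, 下面示例中的导入才能正常使用。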
代码示例
# -*- coding: utf-8 -*-
from lxml.html.clean import Cleaner
# Raw markup that is going to be cleaned before it is stored.
html = """
铁打的腾讯
"""

# Attribute whitelist: stored news articles do not need most attributes (they
# only waste disk space), so only `src` -- needed by image tags -- is kept.
safe_attrs = frozenset({'src'})

# <a> tags themselves are dropped; only the content inside them is kept.
remove_tags = frozenset({'a'})

cleaner = Cleaner(
    remove_tags=remove_tags,
    safe_attrs=safe_attrs,
)
cleaned_html = cleaner.clean_html(html)
print(cleaned_html)
'''
铁打的腾讯
'''
清洗之后内容简洁多了