Skip to content

Commit 8386149

Browse files
author
Jonathan Spalink
committed
Added some bad tags that we might want to drop (header, footer, etc.), and now check itemprops to make sure they don't exist inside of a bad tag.
1 parent 32f3389 commit 8386149

File tree

1 file changed

+16
-13
lines changed

1 file changed

+16
-13
lines changed

readability/readability.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,19 @@
1717
from .htmls import get_title
1818
from .htmls import shorten_title
1919

20-
zlog = logging.getLogger('zenya')
20+
zlog = logging.getLogger('econtext.text')
2121

2222
# Python 2.7 compatibility.
2323
if sys.version < '3':
2424
str = unicode
2525

2626
REGEXES = {
27-
'unlikelyCandidatesRe': re.compile('ad-break|agegate|cart|combx|comment|community|disclaimer|disqus|extra|foot|header|hidden|legal|menu|modal|nav|pager|pagination|polic|popup|reference|remark|review|rss|shoutbox|sidebar|slideshow|sponsor|toc|tweet|twitter|video|warranty', re.I),
28-
'okMaybeItsACandidateRe': re.compile('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about', re.I),
29-
'positiveRe': re.compile('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|itemprop|text', re.I),
30-
'negativeRe': re.compile('ad|ad-break|agegate|cart|citation|combx|comment|community|disclaimer|disqus|extra|feedback|foot|form|fulfillment|header|hidden|legal|menu|modal|nav|pager|pagination|placeholder|polic|popup|qa|question|reference|remark|return|review|rss|shoutbox|sidebar|slideshow|small|sponsor|toc|tweet|twitter|video|warranty', re.I),
31-
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
32-
'negativeStyles': re.compile('display:.?none|visibility:.?hidden', re.I)
27+
'unlikelyCandidatesRe': re.compile('ad-break|agegate|cart|combx|comment|community|disclaimer|disqus|extra|foot|header|hidden|legal|menu|modal|nav|pager|pagination|polic|popup|reference|remark|review|rss|shoutbox|sidebar|slideshow|sponsor|toc|tweet|twitter|video|warranty', re.I),
28+
'okMaybeItsACandidateRe': re.compile('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|text|story', re.I),
29+
'positiveRe': re.compile('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|itemprop|text|story|story-content', re.I),
30+
'negativeRe': re.compile('ad|ad-break|agegate|cart|citation|combx|comment|community|disclaimer|disqus|extra|feedback|foot|form|fulfillment|header|hidden|item|legal|menu|modal|nav|pager|pagination|placeholder|polic|popup|qa|question|reference|remark|return|review|rss|shoutbox|sidebar|slideshow|small|sponsor|toc|tweet|twitter|video|warranty', re.I),
31+
'divToPElementsRe': re.compile('<(a|article|blockquote|dl|div|img|ol|p|pre|table|ul|main)', re.I),
32+
'negativeStyles': re.compile('display:.?none|visibility:.?hidden', re.I)
3333
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
3434
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
3535
#'trimRe': re.compile('^\s+|\s+$/'),
@@ -100,7 +100,7 @@ class Document:
100100

101101
METAPROPS = ['description', 'title', 'keywords', 'og:title', 'og:description', 'twitter:description', 'twitter:title']
102102
ITEMPROPS = ['model', 'brand', 'description', 'name']
103-
BADTAGS = ['nav', 'footer', 'header', 'aside']
103+
BADTAGS = ['footer', 'header', 'nav', 'aside', 'script', 'style']
104104

105105
def __init__(self, input, **options):
106106
"""Generate the document
@@ -211,6 +211,9 @@ def addProps(self, dedupe, base=None):
211211
base = self.html.find(".//body")
212212
for elem in self.html.xpath(".//*[@itemprop]"):
213213
if elem.attrib.get('itemprop') in self.ITEMPROPS:
214+
ancestors = set(a.tag for a in elem.iterancestors())
215+
if len(ancestors.intersection(set(Document.BADTAGS))) > 0:
216+
continue
214217
metacontent = elem.attrib.get('content', elem.text_content().strip())
215218
if dedupe.get(elem.attrib.get('itemprop')) != metacontent:
216219
meta = fragment_fromstring(u'<p class="econtextmax itemprop {}">{}</p>'.format(elem.attrib.get('itemprop'), re.sub("<.*?>", '', metacontent)))
@@ -231,7 +234,7 @@ def summary(self, html_partial=False):
231234
while True:
232235
self._html(True)
233236
to_drop = []
234-
for i in self.tags(self.html, 'script', 'style'):
237+
for i in self.tags(self.html, *self.BADTAGS):
235238
to_drop.append(i)
236239
for i in to_drop:
237240
i.drop_tree()
@@ -651,10 +654,10 @@ def sanitize(self, node, candidates):
651654
if el.getparent() is not None:
652655
el.drop_tree()
653656

654-
for el in ([node] + [n for n in node.iter()]):
655-
if not self.options.get('attributes', None):
656-
#el.attrib = {} #FIXME:Checkout the effects of disabling this
657-
pass
657+
#for el in ([node] + [n for n in node.iter()]):
658+
# if not self.options.get('attributes', None):
659+
# #el.attrib = {} #FIXME:Checkout the effects of disabling this
660+
# pass
658661

659662
self.html = node
660663
return self.get_clean_html()

0 commit comments

Comments
 (0)