17
17
from .htmls import get_title
18
18
from .htmls import shorten_title
19
19
20
- zlog = logging .getLogger ('zenya ' )
20
+ zlog = logging .getLogger ('econtext.text ' )
21
21
22
22
# Python 2.7 compatibility.
23
23
if sys .version < '3' :
24
24
str = unicode
25
25
26
26
REGEXES = {
27
- 'unlikelyCandidatesRe' : re .compile ('ad-break|agegate|cart|combx|comment|community|disclaimer|disqus|extra|foot|header|hidden|legal|menu|modal|nav|pager|pagination|polic|popup|reference|remark|review|rss|shoutbox|sidebar|slideshow|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
28
- 'okMaybeItsACandidateRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about' , re .I ),
29
- 'positiveRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|itemprop|text' , re .I ),
30
- 'negativeRe' : re .compile ('ad|ad-break|agegate|cart|citation|combx|comment|community|disclaimer|disqus|extra|feedback|foot|form|fulfillment|header|hidden|legal|menu|modal|nav|pager|pagination|placeholder|polic|popup|qa|question|reference|remark|return|review|rss|shoutbox|sidebar|slideshow|small|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
31
- 'divToPElementsRe' : re .compile ('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)' , re .I ),
32
- 'negativeStyles' : re .compile ('display:.?none|visibility:.?hidden' , re .I )
27
+ 'unlikelyCandidatesRe' : re .compile ('ad-break|agegate|cart|combx|comment|community|disclaimer|disqus|extra|foot|header|hidden|legal|menu|modal|nav|pager|pagination|polic|popup|reference|remark|review|rss|shoutbox|sidebar|slideshow|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
28
+ 'okMaybeItsACandidateRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|text|story ' , re .I ),
29
+ 'positiveRe' : re .compile ('econtextmax|and|article|body|column|content|main|shadow|product|feature|detail|spec|about|itemprop|text|story|story-content ' , re .I ),
30
+ 'negativeRe' : re .compile ('ad|ad-break|agegate|cart|citation|combx|comment|community|disclaimer|disqus|extra|feedback|foot|form|fulfillment|header|hidden|item |legal|menu|modal|nav|pager|pagination|placeholder|polic|popup|qa|question|reference|remark|return|review|rss|shoutbox|sidebar|slideshow|small|sponsor|toc|tweet|twitter|video|warranty' , re .I ),
31
+ 'divToPElementsRe' : re .compile ('<(a|article| blockquote|dl|div|img|ol|p|pre|table|ul|main )' , re .I ),
32
+ 'negativeStyles' : re .compile ('display:.?none|visibility:.?hidden' , re .I )
33
33
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
34
34
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
35
35
#'trimRe': re.compile('^\s+|\s+$/'),
@@ -100,7 +100,7 @@ class Document:
100
100
101
101
METAPROPS = ['description' , 'title' , 'keywords' , 'og:title' , 'og:description' , 'twitter:description' , 'twitter:title' ]
102
102
ITEMPROPS = ['model' , 'brand' , 'description' , 'name' ]
103
- BADTAGS = ['nav ' , 'footer ' , 'header ' , 'aside' ]
103
+ BADTAGS = ['footer ' , 'header ' , 'nav ' , 'aside' , 'script' , 'style ' ]
104
104
105
105
def __init__ (self , input , ** options ):
106
106
"""Generate the document
@@ -211,6 +211,9 @@ def addProps(self, dedupe, base=None):
211
211
base = self .html .find (".//body" )
212
212
for elem in self .html .xpath (".//*[@itemprop]" ):
213
213
if elem .attrib .get ('itemprop' ) in self .ITEMPROPS :
214
+ ancestors = set (a .tag for a in elem .iterancestors ())
215
+ if len (ancestors .intersection (set (Document .BADTAGS ))) > 0 :
216
+ continue
214
217
metacontent = elem .attrib .get ('content' , elem .text_content ().strip ())
215
218
if dedupe .get (elem .attrib .get ('itemprop' )) != metacontent :
216
219
meta = fragment_fromstring (u'<p class="econtextmax itemprop {}">{}</p>' .format (elem .attrib .get ('itemprop' ), re .sub ("<.*?>" , '' , metacontent )))
@@ -231,7 +234,7 @@ def summary(self, html_partial=False):
231
234
while True :
232
235
self ._html (True )
233
236
to_drop = []
234
- for i in self .tags (self .html , 'script' , 'style' ):
237
+ for i in self .tags (self .html , * self . BADTAGS ):
235
238
to_drop .append (i )
236
239
for i in to_drop :
237
240
i .drop_tree ()
@@ -651,10 +654,10 @@ def sanitize(self, node, candidates):
651
654
if el .getparent () is not None :
652
655
el .drop_tree ()
653
656
654
- for el in ([node ] + [n for n in node .iter ()]):
655
- if not self .options .get ('attributes' , None ):
656
- #el.attrib = {} #FIXME:Checkout the effects of disabling this
657
- pass
657
+ # for el in ([node] + [n for n in node.iter()]):
658
+ # if not self.options.get('attributes', None):
659
+ # #el.attrib = {} #FIXME:Checkout the effects of disabling this
660
+ # pass
658
661
659
662
self .html = node
660
663
return self .get_clean_html ()
0 commit comments