@@ -197,6 +197,7 @@ def addMeta(self, dedupe, base=None):
197
197
meta = fragment_fromstring (u'<p class="econtextmax meta {}">{}</p>' .format (prop , re .sub ("<.*?>" , '' , metacontent )))
198
198
except :
199
199
#zlog.debug(u"metacontent {}: {}".format(prop, metacontent))
200
+ pass
200
201
base .insert (0 , meta )
201
202
#zlog.debug(u" ** Found meta: {}".format(tounicode(meta)))
202
203
dedupe [prop [prop .find (':' )+ 1 :]] = metacontent
@@ -332,10 +333,10 @@ def select_best_candidate(self, candidates):
332
333
for candidate in sorted_candidates [:5 ]:
333
334
elem = candidate ['elem' ]
334
335
#zlog.debug(u"Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
335
-
336
+
336
337
if len (sorted_candidates ) == 0 :
337
338
return None
338
-
339
+
339
340
best_candidate = sorted_candidates [0 ]
340
341
return best_candidate
341
342
@@ -395,7 +396,7 @@ def score_paragraphs(self, ):
395
396
score = candidate ['content_score' ]
396
397
#zlog.debug(u"Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score * (1 - ld)))
397
398
candidate ['content_score' ] *= (1 - ld )
398
-
399
+
399
400
return candidates
400
401
401
402
def class_weight (self , e ):
@@ -404,18 +405,18 @@ def class_weight(self, e):
404
405
if REGEXES ['negativeRe' ].search (e .get ('class' )):
405
406
#zlog.debug(u"debiting score for negativeRe in class {}".format(describe(e)))
406
407
weight -= 35 * len (REGEXES ['negativeRe' ].findall (e .get ('class' )))
407
-
408
+
408
409
if REGEXES ['positiveRe' ].search (e .get ('class' )):
409
410
weight += 25 * len (REGEXES ['positiveRe' ].findall (e .get ('class' )))
410
-
411
+
411
412
if e .get ('id' , None ):
412
413
if REGEXES ['negativeRe' ].search (e .get ('id' )):
413
414
#zlog.debug(u"debiting score for negativeRe in id {}".format(describe(e)))
414
415
weight -= 35 * len (REGEXES ['negativeRe' ].findall (e .get ('id' )))
415
-
416
+
416
417
if REGEXES ['positiveRe' ].search (e .get ('id' )):
417
418
weight += 25 * len (REGEXES ['positiveRe' ].findall (e .get ('id' )))
418
-
419
+
419
420
return weight
420
421
421
422
def score_node (self , elem ):
@@ -457,7 +458,7 @@ def remove_unlikely_candidates(self):
457
458
#zlog.debug(u"Removing hidden content - %s" % describe(elem))
458
459
to_remove .append (elem )
459
460
continue
460
-
461
+
461
462
for elem in to_remove :
462
463
elem .drop_tree ()
463
464
@@ -544,7 +545,7 @@ def sanitize(self, node, candidates):
544
545
for kind in ['p' , 'img' , 'li' , 'a' , 'embed' , 'input' ]:
545
546
counts [kind ] = len (el .findall ('.//%s' % kind ))
546
547
counts ["li" ] -= 100
547
-
548
+
548
549
# Count the text length excluding any surrounding whitespace
549
550
content_length = text_length (el )
550
551
link_density = self .get_link_density (el )
@@ -633,7 +634,7 @@ def sanitize(self, node, candidates):
633
634
#zlog.debug(u"Allowing %s" % describe(el))
634
635
for desnode in self .tags (el , "table" , "ul" , "div" ):
635
636
allowed [desnode ] = True
636
-
637
+
637
638
if to_remove :
638
639
#zlog.debug(u"Cleaned %6.3f %s with weight %s cause it has %s." % (content_score, describe(el), weight, reason))
639
640
#print tounicode(el)
0 commit comments