|
28 | 28 | import slybot
|
29 | 29 |
|
30 | 30 | from collections import defaultdict
|
31 |
| -from itertools import chain, groupby |
| 31 | +from itertools import chain, combinations, groupby |
32 | 32 | from operator import itemgetter
|
33 | 33 | from random import Random
|
34 | 34 | from urllib import unquote
|
|
# Version string read from the imported ``slybot`` package.
SLYBOT_VERSION = slybot.__version__
# Names of the two "scrapy ignore" marker attributes.
# NOTE(review): how this list is consumed is not visible in this part of the
# file -- confirm against its users.
IGNORE_ATTRIBUTES = ['data-scrapy-ignore', 'data-scrapy-ignore-beneath']
# Matches three hyphen-separated groups of four lowercase hex digits,
# e.g. ``1a2b-3c4d-5e6f``.
_ID_RE = re.compile('([0-9a-f]{4}-){2}[0-9a-f]{4}')
|
| 44 | +_TABLE_RE = re.compile('((?:^|\s)table[^\s>]*[\s])', re.I) |
44 | 45 |
|
45 | 46 |
|
46 | 47 | def short_guid():
|
@@ -213,6 +214,31 @@ def css_escape(s):
|
213 | 214 | return s
|
214 | 215 |
|
215 | 216 |
|
def find_generalized_css_selector(elem, sel):
    """Return the selector for ``elem`` widened with table-tolerant variants.

    The selector produced by ``find_css_selector(elem, sel)`` is split on
    ``', '`` into its alternatives, each alternative is expanded by
    ``handle_tables`` and the results are joined back with ``', '``.

    A falsy result from ``find_css_selector`` is returned unchanged so that
    callers which test the selector for truthiness keep working.
    """
    selector = find_css_selector(elem, sel)
    if not selector:
        # Nothing to generalize; avoid calling .split() on a non-string.
        return selector
    return ', '.join(handle_tables(s) for s in selector.split(', '))
| 220 | + |
| 221 | + |
def handle_tables(selector):
    """Return ``selector`` followed by variants with ``> *`` after its tables.

    The selector is cut into segments, each ending with a ``table`` token
    found by ``_TABLE_RE``; whatever follows the last table token is the final
    segment and is never altered.  For every non-empty subset of the other
    segments a variant is built in which `` > *`` is appended to the chosen
    segments.  With ``k`` such segments this yields ``2 ** k - 1`` variants.

    NOTE(review): the extra wildcard step presumably lets the selector match
    when an additional element (e.g. ``tbody``) sits directly under a table --
    confirm against the intended use.

    :param selector: a single CSS selector (no ``', '`` separated group).
    :returns: the original selector and all variants joined with ``', '``.
    """
    generalized = []
    for part in (part.strip() for part in _TABLE_RE.split(selector) if part):
        # _TABLE_RE matches case-insensitively, so the check that recognises
        # its tokens must do the same; otherwise an upper-case ``TABLE`` is
        # left as a segment of its own and the variants become malformed.
        if generalized and part.lower().startswith('table'):
            generalized[-1] = ' '.join((generalized[-1], part))
        else:
            generalized.append(part)

    # Combine segment *positions* rather than segment text: two segments can
    # be textually identical (nested tables reached through the same path) and
    # a lookup by value would then modify the wrong one.
    candidates = range(len(generalized) - 1)
    selectors = []
    for size in range(1, len(generalized)):
        for chosen in combinations(candidates, size):
            variant = generalized[:]
            for idx in chosen:
                variant[idx] += ' > *'
            selectors.append(' '.join(variant))
    return ', '.join([selector] + selectors)
| 240 | + |
| 241 | + |
216 | 242 | def find_css_selector(elem, sel, depth=0, previous_tbody=False):
|
217 | 243 | """Find a unique selector for an element.
|
218 | 244 |
|
@@ -260,15 +286,17 @@ def build_table_selector(elem):
|
260 | 286 | for class_name in classes:
|
261 | 287 | selector = '.%s' % css_escape(class_name)
|
262 | 288 | matches = sel.css(selector)
|
263 |
| - if len(matches) == 1: |
| 289 | + if len(matches) == 1 and tag_name != 'table': |
264 | 290 | return selector
|
265 | 291 | # Maybe it's unique with a tag name?
|
266 | 292 | selector = tag_name + selector
|
267 | 293 | matches = sel.css(selector)
|
268 | 294 | if len(matches) == 1:
|
269 | 295 | return selector
|
| 296 | + tag = tag_name if tag_name == 'table' else '' |
| 297 | + child_idx = children_index(elem) |
270 | 298 | # Maybe it's unique using a tag name and nth-child
|
271 |
| - selector = '%s:nth-child(%s)' % (selector, children_index(elem)) |
| 299 | + selector = '%s%s:nth-child(%s)' % (tag, selector, child_idx) |
272 | 300 | matches = sel.css(selector)
|
273 | 301 | if len(matches) == 1:
|
274 | 302 | return selector
|
@@ -407,7 +435,7 @@ def _create_container(element, container_id, repeated=False, siblings=0,
|
407 | 435 | if isinstance(element, str):
|
408 | 436 | s = element
|
409 | 437 | else:
|
410 |
| - s = find_css_selector(element, selector) |
| 438 | + s = find_generalized_css_selector(element, selector) |
411 | 439 | data = {
|
412 | 440 | 'id': '%s%s' % (container_id, '#parent' if repeated else ''),
|
413 | 441 | 'container_id': None,
|
@@ -465,7 +493,7 @@ def port_generated(generated_annotations, sel):
|
465 | 493 | continue
|
466 | 494 | pre_text, post_text = '', ''
|
467 | 495 | if annotation.get('insert_after'):
|
468 |
| - selector = find_css_selector(element.getparent(), sel) |
| 496 | + selector = find_generalized_css_selector(element.getparent(), sel) |
469 | 497 | if not selector:
|
470 | 498 | continue
|
471 | 499 | annotation['accept_selectors'] = [selector]
|
@@ -512,7 +540,7 @@ def port_standard(standard_annotations, sel, sample, extractors):
|
512 | 540 | element = find_element(annotation, sel)
|
513 | 541 | if element is None:
|
514 | 542 | continue
|
515 |
| - selector = find_css_selector(element, sel) |
| 543 | + selector = find_generalized_css_selector(element, sel) |
516 | 544 | if not selector:
|
517 | 545 | continue
|
518 | 546 | annotation['accept_selectors'] = [selector]
|
|
0 commit comments