Skip to content

Commit 4e55129

Browse files
authored
Merge pull request #673 from scrapinghub/slybot_fix_table_selectors_in_migration
Fix how tables are handled in sample migration
2 parents 129aa46 + fb27c4d commit 4e55129

File tree

2 files changed

+67
-6
lines changed

2 files changed

+67
-6
lines changed

slybot/slybot/plugins/scrapely_annotations/migration.py

+34-6
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import slybot
2929

3030
from collections import defaultdict
31-
from itertools import chain, groupby
31+
from itertools import chain, combinations, groupby
3232
from operator import itemgetter
3333
from random import Random
3434
from urllib import unquote
@@ -41,6 +41,7 @@
4141
SLYBOT_VERSION = slybot.__version__
4242
IGNORE_ATTRIBUTES = ['data-scrapy-ignore', 'data-scrapy-ignore-beneath']
4343
_ID_RE = re.compile('([0-9a-f]{4}-){2}[0-9a-f]{4}')
44+
_TABLE_RE = re.compile('((?:^|\s)table[^\s>]*[\s])', re.I)
4445

4546

4647
def short_guid():
@@ -213,6 +214,31 @@ def css_escape(s):
213214
return s
214215

215216

217+
def find_generalized_css_selector(elem, sel):
    """Find a selector for ``elem`` and generalize its table steps.

    The selector returned by ``find_css_selector`` is split on ``', '`` and
    every alternative is passed through ``handle_tables``; the results are
    joined back together with ``', '``.

    A falsy result from ``find_css_selector`` is returned unchanged: the
    callers test the returned selector for falsiness, so it must not blow up
    here on ``.split`` first.
    """
    selector = find_css_selector(elem, sel)
    if not selector:
        return selector
    return ', '.join(handle_tables(s) for s in selector.split(', '))
220+
221+
222+
def handle_tables(selector):
    """Return ``selector`` followed by variants with `` > *`` after tables.

    The selector is cut into segments, each ending at a ``table`` step found
    by ``_TABLE_RE``; the last segment is whatever follows the final table
    step.  For every non-empty combination of the table-terminated segments a
    variant is generated in which `` > *`` is appended to the chosen segments,
    i.e. one extra element is allowed directly below those tables (presumably
    to cope with an implicit ``tbody`` -- confirm against the HTML parser).

    :param selector: a single CSS selector (not a comma separated group).
    :returns: the original selector and all generated variants joined with
        ``', '``.  A selector with no table step followed by further steps is
        returned unchanged.
    """
    segments = []
    for part in _TABLE_RE.split(selector):
        part = part.strip()
        if not part:
            # Empty pieces come from a match at the very start of the
            # selector or from runs of whitespace; they carry no steps.
            continue
        # _TABLE_RE matches case-insensitively, so the prefix test has to as
        # well, otherwise ``TABLE`` would be left as a segment of its own.
        if segments and part.lower().startswith('table'):
            segments[-1] = ' '.join((segments[-1], part))
        else:
            segments.append(part)

    # Every segment but the last ends with a table step.  Work with positions
    # rather than values so that textually identical segments are each
    # generalized in their own right.
    table_positions = range(len(segments) - 1)
    variants = []
    for size in range(1, len(segments)):
        for chosen in combinations(table_positions, size):
            variant = segments[:]
            for position in chosen:
                variant[position] += ' > *'
            variants.append(' '.join(variant))
    return ', '.join([selector] + variants)
240+
241+
216242
def find_css_selector(elem, sel, depth=0, previous_tbody=False):
217243
"""Find a unique selector for an element.
218244
@@ -260,15 +286,17 @@ def build_table_selector(elem):
260286
for class_name in classes:
261287
selector = '.%s' % css_escape(class_name)
262288
matches = sel.css(selector)
263-
if len(matches) == 1:
289+
if len(matches) == 1 and tag_name != 'table':
264290
return selector
265291
# Maybe it's unique with a tag name?
266292
selector = tag_name + selector
267293
matches = sel.css(selector)
268294
if len(matches) == 1:
269295
return selector
296+
tag = tag_name if tag_name == 'table' else ''
297+
child_idx = children_index(elem)
270298
# Maybe it's unique using a tag name and nth-child
271-
selector = '%s:nth-child(%s)' % (selector, children_index(elem))
299+
selector = '%s%s:nth-child(%s)' % (tag, selector, child_idx)
272300
matches = sel.css(selector)
273301
if len(matches) == 1:
274302
return selector
@@ -407,7 +435,7 @@ def _create_container(element, container_id, repeated=False, siblings=0,
407435
if isinstance(element, str):
408436
s = element
409437
else:
410-
s = find_css_selector(element, selector)
438+
s = find_generalized_css_selector(element, selector)
411439
data = {
412440
'id': '%s%s' % (container_id, '#parent' if repeated else ''),
413441
'container_id': None,
@@ -465,7 +493,7 @@ def port_generated(generated_annotations, sel):
465493
continue
466494
pre_text, post_text = '', ''
467495
if annotation.get('insert_after'):
468-
selector = find_css_selector(element.getparent(), sel)
496+
selector = find_generalized_css_selector(element.getparent(), sel)
469497
if not selector:
470498
continue
471499
annotation['accept_selectors'] = [selector]
@@ -512,7 +540,7 @@ def port_standard(standard_annotations, sel, sample, extractors):
512540
element = find_element(annotation, sel)
513541
if element is None:
514542
continue
515-
selector = find_css_selector(element, sel)
543+
selector = find_generalized_css_selector(element, sel)
516544
if not selector:
517545
continue
518546
annotation['accept_selectors'] = [selector]

slybot/slybot/tests/test_migration.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import unittest
2+
3+
from slybot.plugins.scrapely_annotations.migration import handle_tables
4+
5+
6+
class MigrationTests(unittest.TestCase):
    """Tests for helpers used by the sample migration."""

    def test_table_generalization(self):
        """``handle_tables`` adds `` > *`` variants after table steps only."""
        # Building blocks of a selector that walks through two nested tables.
        outer = 'table.mainbg'
        inner = 'tr:nth-child(5) > td:nth-child(1) > table:nth-child(2)'
        leaf = ('tr:nth-child(3) > td:nth-child(1) > p:nth-child(1) > '
                'strong:nth-child(1)')
        nested = ' > '.join((outer, inner, leaf))
        nested_expected = ', '.join([
            nested,
            ' > '.join((outer, '*', inner, leaf)),
            ' > '.join((outer, inner, '*', leaf)),
            ' > '.join((outer, '*', inner, '*', leaf)),
        ])

        cases = (
            (nested, nested_expected),
            # A class or an id called "table" is not a table element.
            ('div > p > .table > div > span',
             'div > p > .table > div > span'),
            ('div > p > #table > div > span',
             'div > p > #table > div > span'),
            ('div > p > table > div > span',
             'div > p > table > div > span, '
             'div > p > table > * > div > span'),
            ('div > p > table:nth-child(4) > div > span',
             'div > p > table:nth-child(4) > div > span, '
             'div > p > table:nth-child(4) > * > div > span'),
            # A lone table has nothing below it to generalize.
            ('table', 'table'),
        )
        for given, expected in cases:
            self.assertEqual(handle_tables(given), expected)

0 commit comments

Comments
 (0)