Skip to content

Commit c180a05

Browse files
committed
Fix data structure of nested HTML/XML steps
When a child step is nested in the `extract()` method of an `Html` or `Xml` step and does not use `each()` as its base, the extracted value is now a single array with the keys defined in the child step's `extract()` call, rather than an array of such arrays, as it would be with `each()` as the base.
1 parent b576fa8 commit c180a05

File tree

4 files changed

+190
-1
lines changed

4 files changed

+190
-1
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
## [3.2.5] - 2025-02-26
10+
### Fixed
11+
* When a child step is nested in the `extract()` method of an `Html` or `Xml` step and does not use `each()` as its base, the extracted value is now a single array with the keys defined in the child step's `extract()` call, rather than an array of such arrays, as it would be with `each()` as the base.
12+
913
## [3.2.4] - 2025-02-25
1014
### Fixed
1115
* Trying to load a relative reference URI (no scheme and host/authority, only path) via the `HttpLoader` now immediately logs (or throws when `loadOrFail()` is used) an error instead of trying to actually load it.

src/Steps/Dom.php

+18-1
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ protected function mapProperties(Node $node): array
209209
if ($domQuery instanceof Dom) {
210210
$domQuery->baseUrl = $this->baseUrl;
211211

212-
$mappedProperties[$key] = iterator_to_array($domQuery->invoke($node));
212+
$mappedProperties[$key] = $this->getDataFromChildDomStep($domQuery, $node);
213213
} else {
214214
if (is_string($domQuery)) {
215215
$domQuery = $this->makeDefaultDomQueryInstance($domQuery);
@@ -249,4 +249,21 @@ protected function getBase(DomDocument|Node $document): null|Node|NodeList
249249

250250
throw new Exception('Invalid state: no base selector');
251251
}
252+
253+
/**
254+
* @return mixed[]
255+
* @throws Exception
256+
*/
257+
protected function getDataFromChildDomStep(Dom $step, Node $node): array
258+
{
259+
$childValue = iterator_to_array($step->invoke($node));
260+
261+
// When the child step was not used with each() as base and the result is an array with one
262+
// element (index/key "0") being an array, use that child array.
263+
if (!$step->each && count($childValue) === 1 && isset($childValue[0]) && is_array($childValue[0])) {
264+
return $childValue[0];
265+
}
266+
267+
return $childValue;
268+
}
252269
}

tests/Steps/HtmlTest.php

+80
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,86 @@ function () {
156156
},
157157
);
158158

159+
test(
160+
'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' .
161+
'the keys defined in extract(), rather than an array of such arrays as it would be with each().',
162+
function () {
163+
$xml = <<<HTML
164+
<!DOCTYPE html>
165+
<html lang="en">
166+
<head><title>something</title></head>
167+
<body>
168+
<div class="company">
169+
<div class="name">ABCDEFGmbH</div>
170+
<div class="founded">1984</div>
171+
<div class="location">
172+
<span class="country">Germany</span>, <span class="city">Frankfurt</span>
173+
</div>
174+
</div>
175+
<div class="company">
176+
<div class="name">Saubär GmbH</div>
177+
<div class="founded">2014</div>
178+
<div class="location">
179+
<span class="country">Austria</span>, <span class="city">Klagenfurt</span>
180+
</div>
181+
</div>
182+
</body>
183+
</html>
184+
HTML;
185+
186+
$expectedCompany1 = [
187+
'name' => 'ABCDEFGmbH',
188+
'founded' => '1984',
189+
'location' => ['country' => 'Germany', 'city' => 'Frankfurt'],
190+
];
191+
192+
$expectedCompany2 = [
193+
'name' => 'Saubär GmbH',
194+
'founded' => '2014',
195+
'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'],
196+
];
197+
198+
// With base root()
199+
$step = Html::each('.company')->extract([
200+
'name' => '.name',
201+
'founded' => '.founded',
202+
'location' => Html::root()->extract(['country' => '.location .country', 'city' => '.location .city']),
203+
]);
204+
205+
$outputs = helper_invokeStepWithInput($step, $xml);
206+
207+
expect($outputs)->toHaveCount(2)
208+
->and($outputs[0]->get())->toBe($expectedCompany1)
209+
->and($outputs[1]->get())->toBe($expectedCompany2);
210+
211+
// With base first()
212+
$step = Html::each('.company')->extract([
213+
'name' => '.name',
214+
'founded' => '.founded',
215+
'location' => Html::first('.location')->extract(['country' => '.country', 'city' => '.city']),
216+
]);
217+
218+
$outputs = helper_invokeStepWithInput($step, $xml);
219+
220+
expect($outputs)->toHaveCount(2)
221+
->and($outputs[0]->get())->toBe($expectedCompany1)
222+
->and($outputs[1]->get())->toBe($expectedCompany2);
223+
224+
// With base last()
225+
$step = Html::each('.company')->extract([
226+
'name' => '.name',
227+
'founded' => '.founded',
228+
'location' => Html::last('.location')->extract(['country' => '.country', 'city' => '.city']),
229+
]);
230+
231+
$outputs = helper_invokeStepWithInput($step, $xml);
232+
233+
expect($outputs)->toHaveCount(2)
234+
->and($outputs[0]->get())->toBe($expectedCompany1)
235+
->and($outputs[1]->get())->toBe($expectedCompany2);
236+
},
237+
);
238+
159239
test(
160240
'when selecting elements with each(), you can reference the element already selected within the each() selector ' .
161241
'itself, in sub selectors',

tests/Steps/XmlTest.php

+88
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,94 @@ function () {
156156
},
157157
);
158158

159+
test(
160+
'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' .
161+
'the keys defined in extract(), rather than an array of such arrays as it would be with each().',
162+
function () {
163+
$xml = <<<XML
164+
<?xml version="1.0" encoding="UTF-8"?>
165+
<companies>
166+
<company>
167+
<name>ABCDEFGmbH</name>
168+
<founded year="1984">foo</founded>
169+
<location>
170+
<country>Germany</country>
171+
<city>Frankfurt</city>
172+
</location>
173+
</company>
174+
<company>
175+
<name>Saubär GmbH</name>
176+
<founded year="2014">bar</founded>
177+
<location>
178+
<country>Austria</country>
179+
<city>Klagenfurt</city>
180+
</location>
181+
</company>
182+
</companies>
183+
XML;
184+
185+
$expectedCompany1 = [
186+
'name' => 'ABCDEFGmbH',
187+
'founded' => '1984',
188+
'location' => ['country' => 'Germany', 'city' => 'Frankfurt'],
189+
];
190+
191+
$expectedCompany2 = [
192+
'name' => 'Saubär GmbH',
193+
'founded' => '2014',
194+
'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'],
195+
];
196+
197+
// With base root()
198+
$step = Xml::each(Dom::xPath('//companies/company'))->extract([
199+
'name' => Dom::cssSelector('name')->text(),
200+
'founded' => Dom::xPath('//founded')->attribute('year'),
201+
'location' => Xml::root()->extract([
202+
'country' => Dom::xPath('//location/country')->text(),
203+
'city' => Dom::cssSelector('location city')->text(),
204+
]),
205+
]);
206+
207+
$outputs = helper_invokeStepWithInput($step, $xml);
208+
209+
expect($outputs)->toHaveCount(2)
210+
->and($outputs[0]->get())->toBe($expectedCompany1)
211+
->and($outputs[1]->get())->toBe($expectedCompany2);
212+
213+
// With base first()
214+
$step = Xml::each(Dom::xPath('//companies/company'))->extract([
215+
'name' => Dom::cssSelector('name')->text(),
216+
'founded' => Dom::xPath('//founded')->attribute('year'),
217+
'location' => Xml::first(Dom::cssSelector('location'))->extract([
218+
'country' => Dom::xPath('//country')->text(),
219+
'city' => Dom::cssSelector('city')->text(),
220+
]),
221+
]);
222+
223+
$outputs = helper_invokeStepWithInput($step, $xml);
224+
225+
expect($outputs)->toHaveCount(2)
226+
->and($outputs[0]->get())->toBe($expectedCompany1)
227+
->and($outputs[1]->get())->toBe($expectedCompany2);
228+
229+
// With base last()
230+
$step = Xml::each(Dom::xPath('//companies/company'))->extract([
231+
'name' => Dom::cssSelector('name')->text(),
232+
'founded' => Dom::xPath('//founded')->attribute('year'),
233+
'location' => Xml::last(Dom::cssSelector('location'))->extract([
234+
'country' => Dom::xPath('//country')->text(),
235+
'city' => Dom::cssSelector('city')->text(),
236+
]),
237+
]);
238+
239+
$outputs = helper_invokeStepWithInput($step, $xml);
240+
241+
expect($outputs)->toHaveCount(2)
242+
->and($outputs[0]->get())->toBe($expectedCompany1)
243+
->and($outputs[1]->get())->toBe($expectedCompany2);
244+
},
245+
);
246+
159247
it('works when the response string starts with an UTF-8 byte order mark character', function () {
160248
$response = new RespondedRequest(
161249
new Request('GET', 'https://www.example.com/rss'),

0 commit comments

Comments
 (0)