diff --git a/README.rst b/README.rst
index 4a9df765..7afd2504 100644
--- a/README.rst
+++ b/README.rst
@@ -113,8 +113,6 @@ Render out an Element's HTML:
>>> about.html
'
\nAbout\n\n'
-
-
Select Elements within Elements:
.. code-block:: pycon
@@ -129,7 +127,6 @@ Search for links within an element:
>>> about.absolute_links
{'http://brochure.getpython.info/', 'https://www.python.org/about/gettingstarted/', 'https://www.python.org/about/', 'https://www.python.org/about/quotes/', 'https://www.python.org/about/help/', 'https://www.python.org/about/apps/'}
-
Search for text on the page:
.. code-block:: pycon
@@ -144,7 +141,7 @@ More complex CSS Selector example (copied from Chrome dev tools):
>>> r = session.get('https://github.com/')
>>> sel = 'body > div.application-main > div.jumbotron.jumbotron-codelines > div > div > div.col-md-7.text-center.text-md-left > p'
>>> print(r.html.find(sel, first=True).text)
- GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers.
+ GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers.
XPath is also supported:
@@ -244,6 +241,53 @@ You can also use this library without Requests:
>>> html.links
{'https://httpbin.org'}
+Structured Data Extraction
+==========================
+
+Extract structured data from repeated HTML patterns:
+
+.. code-block:: pycon
+
+ >>> from requests_html import HTMLSession, ExtractorPattern, StructuredExtractor
+ >>> session = HTMLSession()
+ >>> r = session.get('https://example.com/products')
+
+ >>> # Define extraction pattern
+ >>> pattern = ExtractorPattern(
+ ... selector=".product-card",
+ ... fields={
+ ... "title": ".product-title",
+ ... "price": ".price",
+ ... "description": ".description"
+ ... },
+ ... required_fields=["title", "price"]
+ ... )
+
+ >>> # Extract structured data
+ >>> products = StructuredExtractor(r.html).extract_structured_data(pattern)
+ >>> products[0]
+ {'title': 'Example Product', 'price': '$99.99', 'description': 'A great product'}
+
+You can also extract data from elements you've already selected:
+
+.. code-block:: pycon
+
+ >>> product_section = r.html.find('.products-section', first=True)
+ >>> products = StructuredExtractor(product_section).extract_structured_data(pattern)
+
+The extractor supports required fields and will skip items missing those fields:
+
+.. code-block:: pycon
+
+ >>> pattern = ExtractorPattern(
+ ... selector=".article",
+ ... fields={
+ ... "title": "h2",
+ ... "date": ".published-date",
+ ... "author": ".author-name"
+ ... },
+ ... required_fields=["title", "date"] # Articles must have title and date
+ ... )
Installation
============
@@ -253,4 +297,4 @@ Installation
$ pipenv install requests-html
✨🍰✨
-Only **Python 3.6 and above** is supported.
+Only **Python 3.6 and above** is supported.
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
index bbb15152..964dc4e7 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,4 +1,7 @@
[pytest]
markers =
render: marks tests for html render
- internet: marks tests which runs on internet pages
\ No newline at end of file
+ internet: marks tests which run on internet pages
+asyncio_mode = strict
+python_files = test_*.py
+testpaths = tests
\ No newline at end of file
diff --git a/requests_html.py b/requests_html.py
index cd341def..351189fe 100644
--- a/requests_html.py
+++ b/requests_html.py
@@ -4,7 +4,7 @@
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures._base import TimeoutError
from functools import partial
-from typing import Set, Union, List, MutableMapping, Optional
+from typing import Set, Union, List, MutableMapping, Optional, Dict
import pyppeteer
import requests
@@ -22,6 +22,9 @@
from parse import findall, Result
from w3lib.encoding import html_to_unicode
+from dataclasses import dataclass
+
+
DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
@@ -843,3 +846,76 @@ def run(self, *coros):
]
done, _ = self.loop.run_until_complete(asyncio.wait(tasks))
return [t.result() for t in done]
+
+
+@dataclass
+class ExtractorPattern:
+ selector: str
+ fields: Dict[str, str]
+ required_fields: Optional[List[str]] = None
+
+class StructuredExtractor:
+ """
+ A utility class to extract structured data from repeated HTML patterns.
+ This extends requests-html's capabilities for handling common web scraping patterns.
+ """
+
+ def __init__(self, html: HTML):
+ self.html = html
+
+ def extract_structured_data(
+ self,
+ pattern: ExtractorPattern,
+ limit: Optional[int] = None
+ ) -> List[Dict[str, str]]:
+ """
+ Extracts structured data from HTML based on defined patterns.
+
+ Args:
+ pattern: ExtractorPattern defining the selection rules
+ limit: Optional maximum number of items to extract
+
+ Returns:
+ List of dictionaries containing the extracted data
+ """
+ results = []
+ elements = self.html.find(pattern.selector, first=False)
+
+        if limit is not None:
+ elements = elements[:limit]
+
+ for element in elements:
+ item_data = {}
+ is_valid = True
+
+ for field_name, field_selector in pattern.fields.items():
+ field_element = element.find(field_selector, first=True)
+
+ if field_element:
+ item_data[field_name] = field_element.text.strip()
+ elif pattern.required_fields and field_name in pattern.required_fields:
+ is_valid = False
+ break
+ else:
+ item_data[field_name] = ""
+
+ if is_valid:
+ results.append(item_data)
+
+ return results
+
+ @classmethod
+ def from_url(cls, url: str) -> 'StructuredExtractor':
+ """
+ Creates a StructuredExtractor instance from a URL.
+
+ Args:
+ url: The URL to fetch and parse
+
+ Returns:
+ StructuredExtractor instance
+ """
+ session = HTMLSession()
+ r = session.get(url)
+ return cls(r.html)
+
\ No newline at end of file
diff --git a/tests/test_structured_extractor.py b/tests/test_structured_extractor.py
new file mode 100644
index 00000000..f8e59606
--- /dev/null
+++ b/tests/test_structured_extractor.py
@@ -0,0 +1,154 @@
+import pytest
+from requests_html import HTML, StructuredExtractor, ExtractorPattern
+
+@pytest.fixture
+def sample_html():
+ return HTML(html='''
+        <div class="products">
+            <div class="product-card">
+                <h2 class="product-title">iPhone 14</h2>
+                <span class="price">$999</span>
+                <p class="description">Latest iPhone model</p>
+            </div>
+            <div class="product-card">
+                <h2 class="product-title">Samsung Galaxy S23</h2>
+                <span class="price">$899</span>
+                <p class="description">Flagship Android phone</p>
+            </div>
+            <div class="product-card">
+                <h2 class="product-title">Google Pixel 7</h2>
+                <p class="description">Google's flagship phone</p>
+            </div>
+        </div>
+
+ ''')
+
+@pytest.fixture
+def basic_pattern():
+ return ExtractorPattern(
+ selector=".product-card",
+ fields={
+ "title": ".product-title",
+ "price": ".price",
+ "description": ".description"
+ }
+ )
+
+@pytest.fixture
+def pattern_with_required():
+ return ExtractorPattern(
+ selector=".product-card",
+ fields={
+ "title": ".product-title",
+ "price": ".price",
+ "description": ".description"
+ },
+ required_fields=["title", "price"]
+ )
+
+def test_extractor_initialization(sample_html):
+ extractor = StructuredExtractor(sample_html)
+ assert extractor.html == sample_html
+
+def test_basic_extraction(sample_html, basic_pattern):
+ extractor = StructuredExtractor(sample_html)
+ results = extractor.extract_structured_data(basic_pattern)
+
+ assert len(results) == 3
+ assert results[0]["title"] == "iPhone 14"
+ assert results[0]["price"] == "$999"
+ assert results[0]["description"] == "Latest iPhone model"
+
+def test_extraction_with_required_fields(sample_html, pattern_with_required):
+ extractor = StructuredExtractor(sample_html)
+ results = extractor.extract_structured_data(pattern_with_required)
+
+ # Should only return 2 items since the third is missing the required price
+ assert len(results) == 2
+ assert all("price" in item for item in results)
+
+def test_extraction_with_limit(sample_html, basic_pattern):
+ extractor = StructuredExtractor(sample_html)
+ results = extractor.extract_structured_data(basic_pattern, limit=1)
+
+ assert len(results) == 1
+ assert results[0]["title"] == "iPhone 14"
+
+def test_missing_optional_field(sample_html):
+ pattern = ExtractorPattern(
+ selector=".product-card",
+ fields={
+ "title": ".product-title",
+ "nonexistent": ".nonexistent-class"
+ }
+ )
+
+ extractor = StructuredExtractor(sample_html)
+ results = extractor.extract_structured_data(pattern)
+
+ assert len(results) == 3
+ assert all(item["nonexistent"] == "" for item in results)
+
+def test_invalid_selector(sample_html):
+ pattern = ExtractorPattern(
+ selector=".nonexistent-container",
+ fields={
+ "title": ".product-title"
+ }
+ )
+
+ extractor = StructuredExtractor(sample_html)
+ results = extractor.extract_structured_data(pattern)
+
+ assert len(results) == 0
+
+def test_empty_html():
+ """Test extraction with empty HTML"""
+ html = HTML(html='')
+ extractor = StructuredExtractor(html)
+
+ pattern = ExtractorPattern(
+ selector=".product-card",
+ fields={
+ "title": ".product-title"
+ }
+ )
+
+ results = extractor.extract_structured_data(pattern)
+ assert len(results) == 0
+
+def test_pattern_without_required_fields():
+ """Test pattern initialization without required fields"""
+ pattern = ExtractorPattern(
+ selector=".product-card",
+ fields={
+ "title": ".product-title"
+ }
+ )
+ assert pattern.required_fields is None
+
+def test_from_url(requests_mock):
+ """Test creating extractor from URL"""
+ html_content = '''
+    <div class="product-card">
+        <h2 class="product-title">Test Product</h2>
+        <span class="price">$100</span>
+    </div>
+ '''
+ requests_mock.get("https://example.com", text=html_content)
+
+ extractor = StructuredExtractor.from_url("https://example.com")
+ assert isinstance(extractor, StructuredExtractor)
+
+ pattern = ExtractorPattern(
+ selector=".product-card",
+ fields={
+ "title": ".product-title",
+ "price": ".price"
+ }
+ )
+
+ results = extractor.extract_structured_data(pattern)
+ assert len(results) == 1
+ assert results[0]["title"] == "Test Product"
+ assert results[0]["price"] == "$100"
\ No newline at end of file