Skip to content

Commit 43793f1

Browse files
authored
Merge pull request #3 from ScrapeGraphAI/pre/beta
Pre/beta
2 parents d3ce047 + 9da0f95 commit 43793f1

File tree

7 files changed

+247
-17
lines changed

7 files changed

+247
-17
lines changed

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,30 @@
1+
## [1.2.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0...v1.2.0-beta.1) (2024-12-18)
2+
3+
4+
### Features
5+
6+
* added pydantic output schema 🔍 ([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9))
7+
8+
9+
### Docs
10+
11+
* added API reference ([d3ce047](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d3ce0470f5c89da910540e42d71afdddd80e8c15))
12+
13+
14+
### CI
15+
16+
* **release:** 1.1.0-beta.1 [skip ci] ([6222a16](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/6222a16a2fec477e7a6e610e0fdd4960e7ccd1b5))
17+
* **release:** 1.1.0-beta.2 [skip ci] ([d5dae57](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/d5dae575921cfa14daa4ceb887b0d7d037d3773d))
18+
19+
## [1.1.0-beta.2](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.1.0-beta.1...v1.1.0-beta.2) (2024-12-18)
20+
21+
22+
### Features
23+
24+
* added pydantic output schema 🔍 ([34b5f10](https://github.com/ScrapeGraphAI/langchain-scrapegraph/commit/34b5f1089059daa25c756b44da593a7c0db97aa9))
25+
26+
## [1.1.0-beta.1](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.0.0...v1.1.0-beta.1) (2024-12-05)
27+
128
## [1.1.0](https://github.com/ScrapeGraphAI/langchain-scrapegraph/compare/v1.0.0...v1.1.0) (2024-12-05)
229

330

README.md

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,39 @@ result = tool.invoke({
5858
print(result)
5959
```
6060

61+
<details>
62+
<summary>🔍 Using Output Schemas with SmartScraperTool</summary>
63+
64+
You can define the structure of the output using Pydantic models:
65+
66+
```python
67+
from typing import List
68+
from pydantic import BaseModel, Field
69+
from langchain_scrapegraph.tools import SmartScraperTool
70+
71+
class WebsiteInfo(BaseModel):
72+
title: str = Field(description="The main title of the webpage")
73+
description: str = Field(description="The main description or first paragraph")
74+
urls: List[str] = Field(description="The URLs inside the webpage")
75+
76+
# Initialize with schema
77+
tool = SmartScraperTool(llm_output_schema=WebsiteInfo)
78+
79+
# The output will conform to the WebsiteInfo schema
80+
result = tool.invoke({
81+
"website_url": "https://www.example.com",
82+
"user_prompt": "Extract the website information"
83+
})
84+
85+
print(result)
86+
# {
87+
# "title": "Example Domain",
88+
# "description": "This domain is for use in illustrative examples...",
89+
# "urls": ["https://www.iana.org/domains/example"]
90+
# }
91+
```
92+
</details>
93+
6194
### 💻 LocalscraperTool
6295
Extract information from HTML content using AI.
6396

@@ -73,6 +106,54 @@ result = tool.invoke({
73106
print(result)
74107
```
75108

109+
<details>
110+
<summary>🔍 Using Output Schemas with LocalScraperTool</summary>
111+
112+
You can define the structure of the output using Pydantic models:
113+
114+
```python
115+
from typing import Optional
116+
from pydantic import BaseModel, Field
117+
from langchain_scrapegraph.tools import LocalScraperTool
118+
119+
class CompanyInfo(BaseModel):
120+
name: str = Field(description="The company name")
121+
description: str = Field(description="The company description")
122+
email: Optional[str] = Field(default=None, description="Contact email if available")
123+
phone: Optional[str] = Field(default=None, description="Contact phone if available")
124+
125+
# Initialize with schema
126+
tool = LocalScraperTool(llm_output_schema=CompanyInfo)
127+
128+
html_content = """
129+
<html>
130+
<body>
131+
<h1>TechCorp Solutions</h1>
132+
<p>We are a leading AI technology company.</p>
133+
<div class="contact">
134+
<p>Email: [email protected]</p>
135+
<p>Phone: (555) 123-4567</p>
136+
</div>
137+
</body>
138+
</html>
139+
"""
140+
141+
# The output will conform to the CompanyInfo schema
142+
result = tool.invoke({
143+
"website_html": html_content,
144+
"user_prompt": "Extract the company information"
145+
})
146+
147+
print(result)
148+
# {
149+
# "name": "TechCorp Solutions",
150+
# "description": "We are a leading AI technology company.",
151+
# "email": "[email protected]",
152+
# "phone": "(555) 123-4567"
153+
# }
154+
```
155+
</details>
156+
76157
## 🌟 Key Features
77158

78159
- 🐦 **LangChain Integration**: Seamlessly works with LangChain agents and chains

examples/localscraper_tool_schema.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from typing import List
2+
3+
from pydantic import BaseModel, Field
4+
from scrapegraph_py.logger import sgai_logger
5+
6+
from langchain_scrapegraph.tools import LocalScraperTool
7+
8+
9+
class WebsiteInfo(BaseModel):
10+
title: str = Field(description="The main title of the webpage")
11+
description: str = Field(description="The main description or first paragraph")
12+
urls: List[str] = Field(description="The URLs inside the webpage")
13+
14+
15+
sgai_logger.set_logging(level="INFO")
16+
17+
# Initialize with Pydantic model class
18+
tool = LocalScraperTool(llm_output_schema=WebsiteInfo)
19+
20+
# Example website and prompt
21+
html_content = """
22+
<html>
23+
<body>
24+
<h1>Company Name</h1>
25+
<p>We are a technology company focused on AI solutions.</p>
26+
<div class="contact">
27+
<p>Email: [email protected]</p>
28+
<p>Phone: (555) 123-4567</p>
29+
</div>
30+
</body>
31+
</html>
32+
"""
33+
user_prompt = "Make a summary of the webpage and extract the email and phone number"
34+
35+
# Use the tool
36+
result = tool.invoke({"website_html": html_content, "user_prompt": user_prompt})
37+
38+
print(result)

examples/smartscraper_tool_schema.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from typing import List
2+
3+
from pydantic import BaseModel, Field
4+
from scrapegraph_py.logger import sgai_logger
5+
6+
from langchain_scrapegraph.tools import SmartScraperTool
7+
8+
9+
class WebsiteInfo(BaseModel):
10+
title: str = Field(description="The main title of the webpage")
11+
description: str = Field(description="The main description or first paragraph")
12+
urls: List[str] = Field(description="The URLs inside the webpage")
13+
14+
15+
sgai_logger.set_logging(level="INFO")
16+
17+
# Initialize with Pydantic model class
18+
tool = SmartScraperTool(llm_output_schema=WebsiteInfo)
19+
20+
# Example website and prompt
21+
website_url = "https://www.example.com"
22+
user_prompt = "Extract info about the website"
23+
24+
# Use the tool - output will conform to WebsiteInfo schema
25+
result = tool.invoke({"website_url": website_url, "user_prompt": user_prompt})
26+
print(result)

langchain_scrapegraph/tools/localscraper.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class LocalScraperTool(BaseTool):
3737
Key init args:
3838
api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
3939
client: Optional pre-configured ScrapeGraph client instance.
40+
llm_output_schema: Optional Pydantic model class (a BaseModel subclass) used to structure the output.
41+
If provided, the tool will ensure the output conforms to this schema.
4042
4143
Instantiate:
4244
.. code-block:: python
@@ -49,6 +51,16 @@ class LocalScraperTool(BaseTool):
4951
# Or provide API key directly
5052
tool = LocalScraperTool(api_key="your-api-key")
5153
54+
# Optionally, you can provide an output schema:
55+
from pydantic import BaseModel, Field
56+
57+
class CompanyInfo(BaseModel):
58+
name: str = Field(description="Company name")
59+
description: str = Field(description="Company description")
60+
email: str = Field(description="Contact email")
61+
62+
tool_with_schema = LocalScraperTool(llm_output_schema=CompanyInfo)
63+
5264
Use the tool:
5365
.. code-block:: python
5466
@@ -71,21 +83,21 @@ class LocalScraperTool(BaseTool):
7183
})
7284
7385
print(result)
86+
# Without schema:
7487
# {
7588
# "description": "We are a technology company focused on AI solutions",
7689
# "contact": {
7790
# "email": "[email protected]",
7891
# "phone": "(555) 123-4567"
7992
# }
8093
# }
81-
82-
Async usage:
83-
.. code-block:: python
84-
85-
result = await tool.ainvoke({
86-
"user_prompt": "Extract contact information",
87-
"website_html": html_content
88-
})
94+
#
95+
# With CompanyInfo schema:
96+
# {
97+
# "name": "Company Name",
98+
# "description": "We are a technology company focused on AI solutions",
99+
# "email": "[email protected]"
100+
# }
89101
"""
90102

91103
name: str = "LocalScraper"
@@ -96,6 +108,7 @@ class LocalScraperTool(BaseTool):
96108
return_direct: bool = True
97109
client: Optional[Client] = None
98110
api_key: str
111+
llm_output_schema: Optional[Type[BaseModel]] = None
99112

100113
@model_validator(mode="before")
101114
@classmethod
@@ -117,10 +130,23 @@ def _run(
117130
"""Use the tool to extract data from a website."""
118131
if not self.client:
119132
raise ValueError("Client not initialized")
120-
response = self.client.localscraper(
121-
website_html=website_html,
122-
user_prompt=user_prompt,
123-
)
133+
134+
if self.llm_output_schema is None:
135+
response = self.client.localscraper(
136+
website_html=website_html,
137+
user_prompt=user_prompt,
138+
)
139+
elif isinstance(self.llm_output_schema, type) and issubclass(
140+
self.llm_output_schema, BaseModel
141+
):
142+
response = self.client.localscraper(
143+
website_html=website_html,
144+
user_prompt=user_prompt,
145+
output_schema=self.llm_output_schema,
146+
)
147+
else:
148+
raise ValueError("llm_output_schema must be a Pydantic model class")
149+
124150
return response["result"]
125151

126152
async def _arun(

langchain_scrapegraph/tools/smartscraper.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class SmartScraperTool(BaseTool):
3737
Key init args:
3838
api_key: Your ScrapeGraph AI API key. If not provided, will look for SGAI_API_KEY env var.
3939
client: Optional pre-configured ScrapeGraph client instance.
40+
llm_output_schema: Optional Pydantic model class (a BaseModel subclass) used to structure the output.
41+
If provided, the tool will ensure the output conforms to this schema.
4042
4143
Instantiate:
4244
.. code-block:: python
@@ -49,6 +51,15 @@ class SmartScraperTool(BaseTool):
4951
# Or provide API key directly
5052
tool = SmartScraperTool(api_key="your-api-key")
5153
54+
# Optionally, you can provide an output schema:
55+
from pydantic import BaseModel, Field
56+
57+
class WebsiteInfo(BaseModel):
58+
title: str = Field(description="The main title")
59+
description: str = Field(description="The main description")
60+
61+
tool_with_schema = SmartScraperTool(llm_output_schema=WebsiteInfo)
62+
5263
Use the tool:
5364
.. code-block:: python
5465
@@ -58,10 +69,17 @@ class SmartScraperTool(BaseTool):
5869
})
5970
6071
print(result)
72+
# Without schema:
6173
# {
6274
# "main_heading": "Example Domain",
6375
# "first_paragraph": "This domain is for use in illustrative examples..."
6476
# }
77+
#
78+
# With WebsiteInfo schema:
79+
# {
80+
# "title": "Example Domain",
81+
# "description": "This domain is for use in illustrative examples..."
82+
# }
6583
6684
Async usage:
6785
.. code-block:: python
@@ -80,6 +98,7 @@ class SmartScraperTool(BaseTool):
8098
return_direct: bool = True
8199
client: Optional[Client] = None
82100
api_key: str
101+
llm_output_schema: Optional[Type[BaseModel]] = None
83102

84103
@model_validator(mode="before")
85104
@classmethod
@@ -101,10 +120,23 @@ def _run(
101120
"""Use the tool to extract data from a website."""
102121
if not self.client:
103122
raise ValueError("Client not initialized")
104-
response = self.client.smartscraper(
105-
website_url=website_url,
106-
user_prompt=user_prompt,
107-
)
123+
124+
if self.llm_output_schema is None:
125+
response = self.client.smartscraper(
126+
website_url=website_url,
127+
user_prompt=user_prompt,
128+
)
129+
elif isinstance(self.llm_output_schema, type) and issubclass(
130+
self.llm_output_schema, BaseModel
131+
):
132+
response = self.client.smartscraper(
133+
website_url=website_url,
134+
user_prompt=user_prompt,
135+
output_schema=self.llm_output_schema,
136+
)
137+
else:
138+
raise ValueError("llm_output_schema must be a Pydantic model class")
139+
108140
return response["result"]
109141

110142
async def _arun(

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "langchain-scrapegraph"
3-
version = "1.1.0"
3+
version = "1.2.0b1"
44
description = "Library for extracting structured data from websites using ScrapeGraphAI"
55
authors = ["Marco Perini <[email protected]>", "Marco Vinciguerra <[email protected]>", "Lorenzo Padoan <[email protected]>"]
66
license = "MIT"

0 commit comments

Comments
 (0)