|
2 | 2 |
|
3 | 3 | [](https://opensource.org/licenses/MIT) |
4 | 4 | [](https://pypi.org/project/langchain-scrapegraph/) |
5 | | -[](https://scrapegraphai.com/documentation) |
| 5 | +[](https://docs.scrapegraphai.com/integrations/langchain) |
6 | 6 |
|
7 | 7 | Supercharge your LangChain agents with AI-powered web scraping capabilities. LangChain-ScrapeGraph provides a seamless integration between [LangChain](https://github.com/langchain-ai/langchain) and [ScrapeGraph AI](https://scrapegraphai.com), enabling your agents to extract structured data from websites using natural language. |
8 | 8 |
|
@@ -58,98 +58,76 @@ result = tool.invoke({ |
58 | 58 | print(result) |
59 | 59 | ``` |
60 | 60 |
|
61 | | -<details> |
62 | | -<summary>🔍 Using Output Schemas with SmartscraperTool</summary> |
63 | | - |
64 | | -You can define the structure of the output using Pydantic models: |
| 61 | +### 🌐 SearchscraperTool |
| 62 | +Search and extract structured information from the web using natural language prompts. |
65 | 63 |
|
66 | 64 | ```python |
67 | | -from typing import List |
68 | | -from pydantic import BaseModel, Field |
69 | | -from langchain_scrapegraph.tools import SmartScraperTool |
| 65 | +from langchain_scrapegraph.tools import SearchScraperTool |
70 | 66 |
|
71 | | -class WebsiteInfo(BaseModel): |
72 | | - title: str = Field(description="The main title of the webpage") |
73 | | - description: str = Field(description="The main description or first paragraph") |
74 | | - urls: List[str] = Field(description="The URLs inside the webpage") |
75 | | - |
76 | | -# Initialize with schema |
77 | | -tool = SmartScraperTool(llm_output_schema=WebsiteInfo) |
| 67 | +# Initialize the tool (uses SGAI_API_KEY from environment) |
| 68 | +tool = SearchScraperTool() |
78 | 69 |
|
79 | | -# The output will conform to the WebsiteInfo schema |
| 70 | +# Search and extract information using natural language |
80 | 71 | result = tool.invoke({ |
81 | | - "website_url": "https://www.example.com", |
82 | | - "user_prompt": "Extract the website information" |
| 72 | + "user_prompt": "What are the key features and pricing of ChatGPT Plus?" |
83 | 73 | }) |
84 | 74 |
|
85 | 75 | print(result) |
86 | 76 | # { |
87 | | -# "title": "Example Domain", |
88 | | -# "description": "This domain is for use in illustrative examples...", |
89 | | -# "urls": ["https://www.iana.org/domains/example"] |
| 77 | +# "product": { |
| 78 | +# "name": "ChatGPT Plus", |
| 79 | +# "description": "Premium version of ChatGPT..." |
| 80 | +# }, |
| 81 | +# "features": [...], |
| 82 | +# "pricing": {...}, |
| 83 | +# "reference_urls": [ |
| 84 | +# "https://openai.com/chatgpt", |
| 85 | +# ... |
| 86 | +# ] |
90 | 87 | # } |
91 | 88 | ``` |
92 | | -</details> |
93 | | - |
94 | | -### 💻 LocalscraperTool |
95 | | -Extract information from HTML content using AI. |
96 | | - |
97 | | -```python |
98 | | -from langchain_scrapegraph.tools import LocalScraperTool |
99 | | - |
100 | | -tool = LocalScraperTool() |
101 | | -result = tool.invoke({ |
102 | | - "user_prompt": "Extract all contact information", |
103 | | - "website_html": "<html>...</html>" |
104 | | -}) |
105 | | - |
106 | | -print(result) |
107 | | -``` |
108 | 89 |
|
109 | 90 | <details> |
110 | | -<summary>🔍 Using Output Schemas with LocalscraperTool</summary> |
| 91 | +<summary>🔍 Using Output Schemas with SearchscraperTool</summary> |
111 | 92 |
|
112 | 93 | You can define the structure of the output using Pydantic models: |
113 | 94 |
|
114 | 95 | ```python |
115 | | -from typing import Optional |
| 96 | +from typing import List, Dict, Any |
116 | 97 | from pydantic import BaseModel, Field |
117 | | -from langchain_scrapegraph.tools import LocalScraperTool |
| 98 | +from langchain_scrapegraph.tools import SearchScraperTool |
118 | 99 |
|
119 | | -class CompanyInfo(BaseModel): |
120 | | - name: str = Field(description="The company name") |
121 | | - description: str = Field(description="The company description") |
122 | | - email: Optional[str] = Field(description="Contact email if available") |
123 | | - phone: Optional[str] = Field(description="Contact phone if available") |
| 100 | +class ProductInfo(BaseModel): |
| 101 | + name: str = Field(description="Product name") |
| 102 | + features: List[str] = Field(description="List of product features") |
| 103 | + pricing: Dict[str, Any] = Field(description="Pricing information") |
| 104 | + reference_urls: List[str] = Field(description="Source URLs for the information") |
124 | 105 |
|
125 | 106 | # Initialize with schema |
126 | | -tool = LocalScraperTool(llm_output_schema=CompanyInfo) |
127 | | - |
128 | | -html_content = """ |
129 | | -<html> |
130 | | - <body> |
131 | | - <h1>TechCorp Solutions</h1> |
132 | | - <p>We are a leading AI technology company.</p> |
133 | | - <div class="contact"> |
134 | | - |
135 | | - <p>Phone: (555) 123-4567</p> |
136 | | - </div> |
137 | | - </body> |
138 | | -</html> |
139 | | -""" |
140 | | - |
141 | | -# The output will conform to the CompanyInfo schema |
| 107 | +tool = SearchScraperTool(llm_output_schema=ProductInfo) |
| 108 | + |
| 109 | +# The output will conform to the ProductInfo schema |
142 | 110 | result = tool.invoke({ |
143 | | - "website_html": html_content, |
144 | | - "user_prompt": "Extract the company information" |
| 111 | + "user_prompt": "What are the key features and pricing of ChatGPT Plus?" |
145 | 112 | }) |
146 | 113 |
|
147 | 114 | print(result) |
148 | 115 | # { |
149 | | -# "name": "TechCorp Solutions", |
150 | | -# "description": "We are a leading AI technology company.", |
151 | | - |
152 | | -# "phone": "(555) 123-4567" |
| 116 | +# "name": "ChatGPT Plus", |
| 117 | +# "features": [ |
| 118 | +# "GPT-4 access", |
| 119 | +# "Faster response speed", |
| 120 | +# ... |
| 121 | +# ], |
| 122 | +# "pricing": { |
| 123 | +# "amount": 20, |
| 124 | +# "currency": "USD", |
| 125 | +# "period": "monthly" |
| 126 | +# }, |
| 127 | +# "reference_urls": [ |
| 128 | +# "https://openai.com/chatgpt", |
| 129 | +# ... |
| 130 | +# ] |
153 | 131 | # } |
154 | 132 | ``` |
155 | 133 | </details> |
|
0 commit comments