diff --git a/.gitignore b/.gitignore
index 2348049..c0b5069 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 .DS_Store
 **/.DS_Store
 *.csv
+venv/
diff --git a/__pycache__/test_sdk.cpython-312.pyc b/__pycache__/test_sdk.cpython-312.pyc
new file mode 100644
index 0000000..8cccba0
Binary files /dev/null and b/__pycache__/test_sdk.cpython-312.pyc differ
diff --git a/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
new file mode 100644
index 0000000..a53cb53
--- /dev/null
+++ b/scrapegraph-py/examples/async/async_smartscraper_infinite_scroll_example.py
@@ -0,0 +1,63 @@
+import asyncio
+
+from scrapegraph_py import AsyncClient
+from scrapegraph_py.logger import sgai_logger
+
+sgai_logger.set_logging(level="INFO")
+
+
+async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
+    """Scrape companies from a specific YC batch with infinite scroll."""
+    try:
+        # Initial scrape with infinite scroll enabled
+        response = await client.smartscraper(
+            website_url=url,
+            user_prompt="Extract all company information from this page, including name, description, and website",
+            number_of_scrolls=10,
+        )
+        # Process the results
+        companies = response.get("result", {}).get("companies", [])
+        if not companies:
+            print(f"No companies found for batch {batch}")
+            return
+
+        # Save or process the companies data
+        print(f"Found {len(companies)} companies in batch {batch}")
+
+        for company in companies:
+            print(f"Company: {company.get('name', 'N/A')}")
+            print(f"Description: {company.get('description', 'N/A')}")
+            print(f"Website: {company.get('website', 'N/A')}")
+            print("-" * 50)
+
+    except Exception as e:
+        print(f"Error scraping batch {batch}: {e}")
+
+
+async def main():
+    # Initialize the async client
+    client = AsyncClient(api_key="Your-API-Key")
+
+    try:
+        # Example YC batch URLs
+        batch_urls = {
+            "W24": "https://www.ycombinator.com/companies?batch=Winter%202024",
+            "S23": "https://www.ycombinator.com/companies?batch=Summer%202023",
+        }
+
+        # Create a task for each batch
+        tasks = [
+            scrape_companies(client, url, batch)
+            for batch, url in batch_urls.items()
+        ]
+
+        # Execute all batch scrapes concurrently
+        await asyncio.gather(*tasks)
+
+    finally:
+        # Ensure the client is properly closed
+        await client.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
diff --git a/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py b/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py
new file mode 100644
index 0000000..5795936
--- /dev/null
+++ b/scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py
@@ -0,0 +1,46 @@
+from scrapegraph_py import Client
+from scrapegraph_py.logger import sgai_logger
+from pydantic import BaseModel
+from typing import List
+
+sgai_logger.set_logging(level="INFO")
+
+# Define the output schema
+class Company(BaseModel):
+    name: str
+    category: str
+    location: str
+
+class CompaniesResponse(BaseModel):
+    companies: List[Company]
+
+# Initialize the client with an explicit API key
+sgai_client = Client(api_key="sgai-api-key")
+
+try:
+    # SmartScraper request with infinite scroll
+    response = sgai_client.smartscraper(
+        website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
+        user_prompt="Extract all company names, categories, and locations from the page",
+        output_schema=CompaniesResponse,
+        number_of_scrolls=10,  # Scroll 10 times to load more companies
+    )
+
+    # Print the response
+    print(f"Request ID: {response['request_id']}")
+
+    # Parse and print the results in a structured way
+    result = CompaniesResponse.model_validate(response['result'])
+    print("\nExtracted Companies:")
+    print("-" * 80)
+    for company in result.companies:
+        print(f"Name: {company.name}")
+        print(f"Category: {company.category}")
+        print(f"Location: {company.location}")
+        print("-" * 80)
+
+except Exception as e:
+    print(f"An error occurred: {e}")
+
+finally:
+    sgai_client.close()
\ No newline at end of file
diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py
index 99b6212..aa54e6c 100644
--- a/scrapegraph-py/scrapegraph_py/async_client.py
+++ b/scrapegraph-py/scrapegraph_py/async_client.py
@@ -174,6 +174,7 @@ async def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -183,6 +184,8 @@ async def smartscraper(
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")
 
         request = SmartScraperRequest(
@@ -191,7 +194,9 @@ async def smartscraper(
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
+
         logger.debug("✅ Request validation passed")
 
         result = await self._make_request(
diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py
index 1168557..7cb6c3b 100644
--- a/scrapegraph-py/scrapegraph_py/client.py
+++ b/scrapegraph-py/scrapegraph_py/client.py
@@ -182,6 +182,7 @@ def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -191,6 +192,8 @@ def smartscraper(
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")
 
         request = SmartScraperRequest(
@@ -199,6 +202,7 @@ def smartscraper(
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
 
         logger.debug("✅ Request validation passed")
diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py
index 21b346e..986144f 100644
--- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py
+++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py
@@ -4,7 +4,7 @@
 from uuid import UUID
 
 from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, conint, model_validator
 
 
 class SmartScraperRequest(BaseModel):
@@ -29,6 +29,11 @@ class SmartScraperRequest(BaseModel):
         description="Optional headers to send with the request, including cookies and user agent",
     )
     output_schema: Optional[Type[BaseModel]] = None
+    number_of_scrolls: Optional[conint(ge=0, le=100)] = Field(
+        default=None,
+        description="Number of times to scroll the page (0-100). If None, no scrolling will be performed.",
+        json_schema_extra={"example": 10},
+    )
 
     @model_validator(mode="after")
     def validate_user_prompt(self) -> "SmartScraperRequest":
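A quick illustration of what the `conint(ge=0, le=100)` bound buys: an out-of-range scroll count now fails when the request model is constructed, before any HTTP call is made. A minimal sketch, not part of the diff; the URL and prompt are placeholder values:

```python
from pydantic import ValidationError

from scrapegraph_py.models.smartscraper import SmartScraperRequest

# A value inside the 0-100 range validates normally.
ok = SmartScraperRequest(
    website_url="https://example.com",     # placeholder URL
    user_prompt="Extract the page title",  # placeholder prompt
    number_of_scrolls=10,
)
assert ok.number_of_scrolls == 10

# A value outside the range is rejected by pydantic up front.
try:
    SmartScraperRequest(
        website_url="https://example.com",
        user_prompt="Extract the page title",
        number_of_scrolls=150,  # violates le=100
    )
except ValidationError as exc:
    print(exc)  # "Input should be less than or equal to 100"
```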
diff --git a/scrapegraph-py/uv.lock b/scrapegraph-py/uv.lock
index c250817..290ee64 100644
--- a/scrapegraph-py/uv.lock
+++ b/scrapegraph-py/uv.lock
@@ -1525,7 +1525,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "aiohttp", specifier = ">=3.11.8" },
+    { name = "aiohttp", specifier = ">=3.10" },
     { name = "beautifulsoup4", specifier = ">=4.12.3" },
    { name = "furo", marker = "extra == 'docs'", specifier = "==2024.5.6" },
     { name = "pydantic", specifier = ">=2.10.2" },
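For completeness, a rough sketch of what the serialized request carries once `number_of_scrolls` is set, assuming only the model fields visible in the diff above; `exclude_none=True` is my choice here to drop unset optionals, and the URL and prompt are placeholders:

```python
from scrapegraph_py.models.smartscraper import SmartScraperRequest

request = SmartScraperRequest(
    website_url="https://example.com",     # placeholder URL
    user_prompt="Extract the page title",  # placeholder prompt
    number_of_scrolls=5,
)

# The new field travels with the payload; None-valued optionals
# (website_html, headers, output_schema) are dropped by exclude_none.
print(request.model_dump(exclude_none=True))
# -> something like:
#    {'website_url': 'https://example.com',
#     'user_prompt': 'Extract the page title',
#     'number_of_scrolls': 5}
```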