
feat: add infinite scrolling #38

Open · wants to merge 4 commits into base: main
@@ -0,0 +1,64 @@
import asyncio
from typing import List, Dict, Any

from scrapegraph_py import AsyncClient
from scrapegraph_py.logger import sgai_logger

sgai_logger.set_logging(level="INFO")


async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
"""Scrape companies from a specific YC batch with infinite scroll."""
try:
# Initial scrape with infinite scroll enabled
response = await client.smartscraper(

Reviewer: try block should be linear only

Contributor Author: update it

(One possible refactor is sketched after this file.)

website_url=url,
user_prompt="Extract all company information from this page, including name, description, and website",
number_of_scrolls=10,
)

# Process the results
        companies: List[Dict[str, Any]] = response.get("result", [])
if not companies:
print(f"No companies found for batch {batch}")
return

# Save or process the companies data
print(f"Found {len(companies)} companies in batch {batch}")
for company in companies:
print(f"Company: {company.get('name', 'N/A')}")
print(f"Description: {company.get('description', 'N/A')}")
print(f"Website: {company.get('website', 'N/A')}")
print("-" * 50)

except Exception as e:
print(f"Error scraping batch {batch}: {str(e)}")


async def main():
# Initialize async client
client = AsyncClient(api_key="your-api-key-here")

try:
# Example YC batch URLs
batch_urls = {
"W24": "https://www.ycombinator.com/companies?batch=W24",
"S23": "https://www.ycombinator.com/companies?batch=S23"
}

# Create tasks for each batch
tasks = [
scrape_companies(client, url, batch)
for batch, url in batch_urls.items()
]

# Execute all batch scraping concurrently
await asyncio.gather(*tasks)

finally:
# Ensure client is properly closed
await client.close()


if __name__ == "__main__":
asyncio.run(main())
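Per the review note above, one way to keep the try block linear is to wrap only the awaited call and move the result handling out of it. A minimal sketch of that refactor (same response shape as the example above; this is a suggestion, not code from this PR):

async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
    """Scrape companies from a specific YC batch with infinite scroll."""
    try:
        # Only the fallible network call lives inside the try block
        response = await client.smartscraper(
            website_url=url,
            user_prompt="Extract all company information from this page, including name, description, and website",
            number_of_scrolls=10,
        )
    except Exception as e:
        print(f"Error scraping batch {batch}: {e}")
        return

    # Result handling stays outside the try block
    companies = response.get("result", [])
    if not companies:
        print(f"No companies found for batch {batch}")
        return

    print(f"Found {len(companies)} companies in batch {batch}")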
@@ -0,0 +1,46 @@
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
from pydantic import BaseModel
from typing import List

sgai_logger.set_logging(level="INFO")

# Define the output schema
class Company(BaseModel):
name: str
category: str
location: str

class CompaniesResponse(BaseModel):
companies: List[Company]

# Initialize the client with explicit API key
sgai_client = Client(api_key="sgai-api-key")

try:
# SmartScraper request with infinite scroll
response = sgai_client.smartscraper(
website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
user_prompt="Extract all company names and their categories from the page",
output_schema=CompaniesResponse,
number_of_scrolls=10 # Scroll 10 times to load more companies
)

# Print the response
print(f"Request ID: {response['request_id']}")

# Parse and print the results in a structured way
result = CompaniesResponse.model_validate(response['result'])
print("\nExtracted Companies:")
print("-" * 80)
for company in result.companies:
print(f"Name: {company.name}")
print(f"Category: {company.category}")
print(f"Location: {company.location}")
print("-" * 80)

except Exception as e:
print(f"An error occurred: {e}")

finally:
sgai_client.close()
4 changes: 4 additions & 0 deletions scrapegraph-py/scrapegraph_py/async_client.py
@@ -174,6 +174,7 @@ async def smartscraper(
website_html: Optional[str] = None,
headers: Optional[dict[str, str]] = None,
output_schema: Optional[BaseModel] = None,
number_of_scrolls: Optional[int] = None,
):
"""Send a smartscraper request"""
logger.info("🔍 Starting smartscraper request")
@@ -183,6 +184,8 @@
logger.debug("📄 Using provided HTML content")
if headers:
logger.debug("🔧 Using custom headers")
if number_of_scrolls is not None:
logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
logger.debug(f"📝 Prompt: {user_prompt}")

request = SmartScraperRequest(
@@ -191,6 +194,7 @@
headers=headers,
user_prompt=user_prompt,
output_schema=output_schema,
number_of_scrolls=number_of_scrolls,
)
logger.debug("✅ Request validation passed")

4 changes: 4 additions & 0 deletions scrapegraph-py/scrapegraph_py/client.py
@@ -182,6 +182,7 @@ def smartscraper(
website_html: Optional[str] = None,
headers: Optional[dict[str, str]] = None,
output_schema: Optional[BaseModel] = None,
number_of_scrolls: Optional[int] = None,
):
"""Send a smartscraper request"""
logger.info("🔍 Starting smartscraper request")
@@ -191,6 +192,8 @@
logger.debug("📄 Using provided HTML content")
if headers:
logger.debug("🔧 Using custom headers")
if number_of_scrolls is not None:
logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
logger.debug(f"📝 Prompt: {user_prompt}")

request = SmartScraperRequest(
@@ -199,6 +202,7 @@
headers=headers,
user_prompt=user_prompt,
output_schema=output_schema,
number_of_scrolls=number_of_scrolls,
)
logger.debug("✅ Request validation passed")

7 changes: 6 additions & 1 deletion scrapegraph-py/scrapegraph_py/models/smartscraper.py
@@ -4,7 +4,7 @@
from uuid import UUID

from bs4 import BeautifulSoup
from pydantic import BaseModel, Field, model_validator
from pydantic import BaseModel, Field, model_validator, conint


class SmartScraperRequest(BaseModel):
@@ -29,6 +29,11 @@ class SmartScraperRequest(BaseModel):
description="Optional headers to send with the request, including cookies and user agent",
)
output_schema: Optional[Type[BaseModel]] = None
number_of_scrolls: Optional[conint(ge=0, le=100)] = Field(
default=None,
description="Number of times to scroll the page (0-100). If None, no scrolling will be performed.",
example=10
)

@model_validator(mode="after")
def validate_user_prompt(self) -> "SmartScraperRequest":
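Because number_of_scrolls is declared as conint(ge=0, le=100), out-of-range values are rejected when the request model is constructed, before any request is sent. A quick sketch of the expected behavior (hypothetical URL and prompt; assumes pydantic raises its usual ValidationError):

from pydantic import ValidationError

from scrapegraph_py.models.smartscraper import SmartScraperRequest

try:
    SmartScraperRequest(
        user_prompt="Extract company names",
        website_url="https://example.com",
        number_of_scrolls=150,  # rejected: must be between 0 and 100
    )
except ValidationError as e:
    print(e)  # pydantic reports the constraint violation for number_of_scrolls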
2 changes: 1 addition & 1 deletion scrapegraph-py/uv.lock

Some generated files are not rendered by default.