diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 9c24e48ae6..df63365882 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -8,7 +8,9 @@ "Bash(uv run:*)", "Bash(find:*)", "Bash(rg:*)", - "Bash(cargo check:*)" + "Bash(cargo check:*)", + "WebFetch(domain:gloochat.notion.site)", + "WebFetch(domain:www.boundaryml.com)" ], "deny": [] } diff --git a/fern/01-guide/09-comparisons/ai-sdk.mdx b/fern/01-guide/09-comparisons/ai-sdk.mdx new file mode 100644 index 0000000000..d0548649bb --- /dev/null +++ b/fern/01-guide/09-comparisons/ai-sdk.mdx @@ -0,0 +1,376 @@ +--- +title: Comparing AI SDK +--- + +[AI SDK](https://sdk.vercel.ai/) by Vercel is a powerful toolkit for building AI-powered applications in TypeScript. It's particularly popular for Next.js and React developers. + +Let's explore how AI SDK handles structured extraction and where the complexity creeps in. + +### Why working with LLMs requires more than just AI SDK + +AI SDK makes structured data generation look elegant at first: + +```typescript +import { generateObject } from 'ai'; +import { openai } from '@ai-sdk/openai'; +import { z } from 'zod'; + +const Resume = z.object({ + name: z.string(), + skills: z.array(z.string()) +}); + +const { object } = await generateObject({ + model: openai('gpt-4o'), + schema: Resume, + prompt: 'John Doe, Python, Rust' +}); +``` + +Clean and simple! But let's make it more realistic by adding education: + +```diff ++const Education = z.object({ ++ school: z.string(), ++ degree: z.string(), ++ year: z.number() ++}); + +const Resume = z.object({ + name: z.string(), + skills: z.array(z.string()), ++ education: z.array(Education) +}); + +const { object } = await generateObject({ + model: openai('gpt-4o'), + schema: Resume, + prompt: `John Doe +Python, Rust +University of California, Berkeley, B.S. in Computer Science, 2020` +}); +``` + +Still works! But... what's the actual prompt being sent? How many tokens is this costing? + +### The visibility problem + +Your manager asks: "Why did the extraction fail for this particular resume?" + +```typescript +// How do you debug what went wrong? +const { object } = await generateObject({ + model: openai('gpt-4o'), + schema: Resume, + prompt: complexResumeText +}); + +// You can't see: +// - The actual prompt sent to the model +// - The schema format used +// - Why certain fields were missed +``` + +You start digging through the AI SDK source code to understand the prompt construction... + +### Classification challenges + +Now your PM wants to classify resumes by seniority level: + +```typescript +const SeniorityLevel = z.enum(['junior', 'mid', 'senior', 'staff']); + +const Resume = z.object({ + name: z.string(), + skills: z.array(z.string()), + education: z.array(Education), + seniority: SeniorityLevel +}); +``` + +But wait... how do you tell the model what "junior" vs "senior" means? Zod enums are just string literals: + +```typescript +// You can't add descriptions to enum values! +// How does the model know junior = 0-2 years experience? + +// You try adding a comment... +const SeniorityLevel = z.enum([ + 'junior', // 0-2 years + 'mid', // 2-5 years + 'senior', // 5-10 years + 'staff' // 10+ years +]); +// But comments aren't sent to the model! + +// So you end up doing this hack: +const { object } = await generateObject({ + model: openai('gpt-4o'), + schema: Resume, + prompt: `Extract resume information. 
+ +Seniority levels: +- junior: 0-2 years experience +- mid: 2-5 years experience +- senior: 5-10 years experience +- staff: 10+ years experience + +Resume: +${resumeText}` +}); +``` + +Your clean abstraction is leaking... + +### Multi-provider pain + +Your company wants to use different models for different use cases: + +```typescript +// First, install a bunch of packages +npm install @ai-sdk/openai @ai-sdk/anthropic @ai-sdk/google @ai-sdk/mistral + +// Import from different packages +import { openai } from '@ai-sdk/openai'; +import { anthropic } from '@ai-sdk/anthropic'; +import { google } from '@ai-sdk/google'; + +// Now you need provider detection logic +function getModel(provider: string) { + switch(provider) { + case 'openai': return openai('gpt-4o'); + case 'anthropic': return anthropic('claude-3-opus-20240229'); + case 'google': return google('gemini-pro'); + // Don't forget to handle errors... + } +} + +// And manage different API keys +const providers = { + openai: process.env.OPENAI_API_KEY, + anthropic: process.env.ANTHROPIC_API_KEY, + google: process.env.GOOGLE_API_KEY, + // More environment variables to manage... +}; +``` + +### Testing without burning money + +You want to test your extraction logic: + +```typescript +// How do you test this without API calls? +const { object } = await generateObject({ + model: openai('gpt-4o'), + schema: Resume, + prompt: testResumeText +}); + +// Mock the entire AI SDK? +jest.mock('ai', () => ({ + generateObject: jest.fn().mockResolvedValue({ + object: { name: 'Test', skills: ['JS'] } + }) +})); + +// But you're not testing your schema or prompt... +// Just that your mocks return the right shape +``` + +### The real-world spiral + +As your app grows, you need: +- Custom extraction strategies for different document types +- Retry logic for flaky models +- Token usage tracking for cost control +- Prompt versioning for A/B testing + +Your code evolves into: + +```typescript +class ResumeExtractor { + private tokenCounter: TokenCounter; + private promptTemplates: Map; + private retryConfig: RetryConfig; + + async extract(text: string, options?: ExtractOptions) { + const model = this.selectModel(options); + const prompt = this.buildPrompt(text, options); + + return this.withRetry(async () => { + const start = Date.now(); + const tokens = this.tokenCounter.estimate(prompt); + + try { + const result = await generateObject({ + model, + schema: Resume, + prompt + }); + + this.logUsage({ tokens, duration: Date.now() - start }); + return result; + } catch (error) { + this.handleError(error); + } + }); + } + + // ... dozens more methods +} +``` + +The simple AI SDK call is now buried in layers of infrastructure code. + +## Enter BAML + +BAML was designed for the reality of production LLM applications. Here's the same resume extraction: + +```baml +class Education { + school string + degree string + year int +} + +enum SeniorityLevel { + JUNIOR @description("0-2 years of experience") + MID @description("2-5 years of experience") + SENIOR @description("5-10 years of experience") + STAFF @description("10+ years of experience, technical leadership") +} + +class Resume { + name string + skills string[] + education Education[] + seniority SeniorityLevel +} + +function ExtractResume(resume_text: string) -> Resume { + client GPT4 + prompt #" + Extract the following information from the resume. 
+ + Pay attention to the seniority descriptions: + {{ ctx.output_format.seniority }} + + Resume: + --- + {{ resume_text }} + --- + + {{ ctx.output_format }} + "# +} +``` + +Notice what you get immediately: +1. **The prompt is right there** - No digging through source code +2. **Enums with descriptions** - The model knows what each value means +3. **Type definitions that become prompts** - Less tokens, clearer instructions + +### Multi-model made simple + +```baml +// All providers in one place +client GPT4 { + provider openai + options { + model "gpt-4o" + temperature 0.1 + } +} + +client Claude { + provider anthropic + options { + model "claude-3-opus-20240229" + temperature 0.1 + } +} + +client Gemini { + provider google + options { + model "gemini-pro" + } +} + +client Llama { + provider ollama + options { + model "llama3" + } +} + +// Same function, any model +function ExtractResume(resume_text: string) -> Resume { + client GPT4 // Just change this + prompt #"..."# +} +``` + +Use it in TypeScript: +```typescript +import { baml } from '@/baml_client'; + +// Use default model +const resume = await baml.ExtractResume(resumeText); + +// Switch models based on your needs +const complexResume = await baml.ExtractResume(complexText, { client: "Claude" }); +const simpleResume = await baml.ExtractResume(simpleText, { client: "Llama" }); + +// Everything is fully typed! +console.log(resume.seniority); // TypeScript knows this is SeniorityLevel +``` + +### Testing that actually tests + +With BAML's VSCode extension, you can: + +BAML development tools in VSCode + +1. **Test prompts without API calls** - Instant feedback +2. **See exactly what will be sent** - Full transparency +3. **Iterate on prompts instantly** - No deploy cycles +4. **Save test cases** for regression testing + +BAML code lens showing test options + +*No mocking required - you're testing the actual prompt and parsing logic.* + +### The bottom line + +AI SDK is fantastic for building streaming AI applications in Next.js. But for structured extraction, you end up fighting the abstractions. + +**BAML's advantages over AI SDK:** +- **Prompt transparency** - See and control exactly what's sent to the LLM +- **Purpose-built types** - Enums with descriptions, aliases, better schema format +- **Unified model interface** - All providers work the same way, switch with one line +- **Real testing** - Test in VSCode without API calls or burning tokens +- **Schema-Aligned Parsing** - Get structured outputs from any model +- **Better token efficiency** - Optimized schema format uses fewer tokens +- **Production features** - Built-in retries, fallbacks, and error handling + +**What this means for your TypeScript apps:** +- **Faster development** - Test prompts instantly without running Next.js +- **Better debugging** - Know exactly why extraction failed +- **Cost optimization** - See token usage and optimize prompts +- **Model flexibility** - Never get locked into one provider +- **Cleaner code** - No wrapper classes or infrastructure code needed + +**AI SDK is great for:** Streaming UI, Next.js integration, rapid prototyping +**BAML is great for:** Production structured extraction, multi-model apps, cost optimization + +We built BAML because we were tired of elegant APIs that fall apart when you need production reliability and control. + +### Limitations of BAML + +BAML does have some limitations: +1. It's a new language (but learning takes < 10 minutes) +2. Best experience requires VSCode +3. 
Focused on structured extraction, not general AI features + +If you're building a Next.js app with streaming UI, use AI SDK. If you want bulletproof structured extraction with full control, [try BAML](https://docs.boundaryml.com). \ No newline at end of file diff --git a/fern/01-guide/09-comparisons/langchain.mdx b/fern/01-guide/09-comparisons/langchain.mdx index 6d14055ba0..d42060d883 100644 --- a/fern/01-guide/09-comparisons/langchain.mdx +++ b/fern/01-guide/09-comparisons/langchain.mdx @@ -1,10 +1,303 @@ --- title: Comparing Langchain -slug: docs/comparisons/langchain --- +[Langchain](https://github.com/langchain-ai/langchain) is one of the most popular frameworks for building LLM applications. It provides abstractions for chains, agents, memory, and more. -[Langchain](https://langchain.com) is a toolkit that helps developers build AI applications. +Let's dive into how Langchain handles structured extraction and where it falls short. -### The LCEL +### Why working with LLMs requires more than just Langchain +Langchain makes structured extraction look simple at first: + +```python +from pydantic import BaseModel, Field +from langchain_openai import ChatOpenAI + +class Resume(BaseModel): + name: str + skills: List[str] + +llm = ChatOpenAI(model="gpt-4o") +structured_llm = llm.with_structured_output(Resume) +result = structured_llm.invoke("John Doe, Python, Rust") +``` + +That's pretty neat! But now let's add an `Education` model to make it more realistic: + +```diff ++class Education(BaseModel): ++ school: str ++ degree: str ++ year: int + +class Resume(BaseModel): + name: str + skills: List[str] ++ education: List[Education] + +structured_llm = llm.with_structured_output(Resume) +result = structured_llm.invoke("""John Doe +Python, Rust +University of California, Berkeley, B.S. in Computer Science, 2020""") +``` + +Still works... but what's actually happening under the hood? What prompt is being sent? How many tokens are we using? + +Let's dig deeper. Say you want to see what's actually being sent to the model: + +```python +# How do you debug this? +structured_llm = llm.with_structured_output(Resume) + +# You need to enable verbose mode or dig into callbacks +from langchain.globals import set_debug +set_debug(True) + +# Now you get TONS of debug output... +``` + +But even with debug mode, you still can't easily: +- Modify the extraction prompt +- See the exact token count +- Understand why extraction failed for certain inputs + +### When things go wrong + +Here's where it gets tricky. Your PM asks: "Can we classify these resumes by seniority level?" + +```python +from enum import Enum + +class SeniorityLevel(str, Enum): + JUNIOR = "junior" + MID = "mid" + SENIOR = "senior" + STAFF = "staff" + +class Resume(BaseModel): + name: str + skills: List[str] + education: List[Education] + seniority: SeniorityLevel +``` + +But now you realize you need to give the LLM context about what each level means: + +```python +# Wait... how do I tell the LLM that "junior" means 0-2 years experience? +# How do I customize the prompt? + +# You end up doing this: +CLASSIFICATION_PROMPT = """ +Given the resume below, classify the seniority level: +- junior: 0-2 years experience +- mid: 2-5 years experience +- senior: 5-10 years experience +- staff: 10+ years experience + +Resume: {resume_text} +""" + +# Now you need separate chains... 
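+# (Imports assumed by this sketch; note that newer LangChain releases deprecate
+#  LLMChain in favor of LCEL composition, e.g.
+#  `PromptTemplate.from_template(CLASSIFICATION_PROMPT) | llm`.)
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate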
+classification_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(CLASSIFICATION_PROMPT)) +extraction_chain = llm.with_structured_output(Resume) + +# And combine them somehow... +``` + +Your clean code is starting to look messy. But wait, there's more! + +### Multi-model madness + +Your company wants to use Claude for some tasks (better reasoning) and GPT-4-mini for others (cost savings). With Langchain: + +```python +from langchain_anthropic import ChatAnthropic +from langchain_openai import ChatOpenAI + +# Different providers, different imports +claude = ChatAnthropic(model="claude-3-opus-20240229") +gpt4 = ChatOpenAI(model="gpt-4o") +gpt4_mini = ChatOpenAI(model="gpt-4o-mini") + +# But wait... does Claude support structured outputs the same way? +claude_structured = claude.with_structured_output(Resume) # May not work! + +# You need provider-specific handling +if provider == "anthropic": + # Use function calling? XML? JSON mode? + # Different providers have different capabilities + pass +``` + +### Testing nightmare + +Now you want to test your extraction logic without burning through API credits: + +```python +# How do you test this? +structured_llm = llm.with_structured_output(Resume) + +# Mock the entire LLM? +from unittest.mock import Mock +mock_llm = Mock() +mock_llm.with_structured_output.return_value.invoke.return_value = Resume(...) + +# But you're not really testing your extraction logic... +# Just that your mocks work +``` + +**With BAML, testing is visual and instant:** + +VSCode test case buttons for instant testing + +*Test your prompts instantly without API calls or mocking* + +### The token mystery + +Your CFO asks: "Why is our OpenAI bill so high?" You investigate: + +```python +# How many tokens does this use? +structured_llm = llm.with_structured_output(Resume) +result = structured_llm.invoke(long_resume_text) + +# You need callbacks or token counting utilities +from langchain.callbacks import get_openai_callback + +with get_openai_callback() as cb: + result = structured_llm.invoke(long_resume_text) + print(f"Tokens: {cb.total_tokens}") # Finally! +``` + +But you still don't know WHY it's using so many tokens. Is it the schema format? The prompt template? The retry logic? + +## Enter BAML + +BAML was built specifically for these LLM challenges. Here's the same resume extraction: + +```baml +class Education { + school string + degree string + year int +} + +class Resume { + name string + skills string[] + education Education[] + seniority SeniorityLevel +} + +enum SeniorityLevel { + JUNIOR @description("0-2 years of experience") + MID @description("2-5 years of experience") + SENIOR @description("5-10 years of experience") + STAFF @description("10+ years of experience, technical leadership") +} + +function ExtractResume(resume_text: string) -> Resume { + client GPT4 + prompt #" + Extract information from this resume. + + For seniority level, consider: + {{ ctx.output_format.seniority }} + + Resume: + --- + {{ resume_text }} + --- + + {{ ctx.output_format }} + "# +} +``` + +Now look what you get: + +1. **See exactly what's sent to the LLM** - The prompt is right there! +2. **Test without API calls** - Use the VSCode playground +3. **Switch models instantly** - Just change `client GPT4` to `client Claude` +4. **Token count visibility** - BAML shows exact token usage +5. 
**Modify prompts easily** - It's just a template string + +### Multi-model support done right + +```baml +// Define all your clients in one place +client GPT4 { + provider openai + options { + model "gpt-4o" + temperature 0.1 + } +} + +client GPT4Mini { + provider openai + options { + model "gpt-4o-mini" + temperature 0.1 + } +} + +client Claude { + provider anthropic + options { + model "claude-3-opus-20240229" + max_tokens 4096 + } +} + +// Same function works with ANY model +function ExtractResume(resume_text: string) -> Resume { + client GPT4 // Just change this line + prompt #"..."# +} +``` + +Use it in Python: +```python +from baml_client import baml as b + +# Use default model +resume = await b.ExtractResume(resume_text) + +# Override at runtime based on your needs +resume_complex = await b.ExtractResume(complex_text, {"client": "Claude"}) +resume_simple = await b.ExtractResume(simple_text, {"client": "GPT4Mini"}) +``` + +### The bottom line + +Langchain is great for building complex LLM applications with chains, agents, and memory. But for structured extraction, you're fighting against abstractions that hide important details. + +**BAML gives you what Langchain can't:** +- **Full prompt transparency** - See and control exactly what's sent to the LLM +- **Native testing** - Test in VSCode without API calls or burning tokens +- **Multi-model by design** - Switch providers with one line, works with any model +- **Token visibility** - Know exactly what you're paying for and optimize costs +- **Type safety** - Generated clients with autocomplete that always match your schema +- **Schema-Aligned Parsing** - Get structured outputs from any model, even without function calling +- **Streaming + Structure** - Stream structured data with loading bars and type-safe parsing + +**Why this matters for production:** +- **Faster iteration** - See changes instantly without running Python code +- **Better debugging** - Know exactly why extraction failed +- **Cost optimization** - Understand and reduce token usage +- **Model flexibility** - Never get locked into one provider +- **Team collaboration** - Prompts are code, not hidden strings + +We built BAML because we were tired of wrestling with framework abstractions when all we wanted was reliable structured extraction with full developer control. + +### Limitations of BAML + +BAML does have some limitations we are continuously working on: +1. It is a new language. However, it is fully open source and getting started takes less than 10 minutes +2. Developing requires VSCode. You _could_ use vim but we don't recommend it +3. It's focused on structured extraction - not a full LLM framework like Langchain + +If you need complex chains and agents, use Langchain. If you want the best structured extraction experience with full control, [try BAML](https://docs.boundaryml.com). \ No newline at end of file diff --git a/fern/01-guide/09-comparisons/marvin.mdx b/fern/01-guide/09-comparisons/marvin.mdx index de323c9884..16310138e3 100644 --- a/fern/01-guide/09-comparisons/marvin.mdx +++ b/fern/01-guide/09-comparisons/marvin.mdx @@ -106,18 +106,30 @@ requestType = await b.ClassifyRequest("Reset my password") assert requestType == RequestType.ACCOUNT ``` -The prompt string may be more wordy, but with BAML you now have -1. Fully typed responses, guaranteed -1. Full transparency and flexibility of the prompt string -1. Full freedom for what model to use -1. Helper functions to manipulate types in prompts (print_enum) -1. 
Testing capabilities using the VSCode playground -1. Analytics in the Boundary Dashboard -1. Support for TypeScript -1. A better understanding of how prompt engineering works - - -Marvin was a big source of inspiration for us -- their approach is simple and elegant. We recommend checking out Marvin if you're just starting out with prompt engineering or want to do a one-off simple task in Python. But if you'd like a whole added set of features, we'd love for you to give BAML a try and let us know what you think. +### The bottom line + +Marvin was a big source of inspiration for us -- their approach is simple and elegant for quick Python prototypes. + +**BAML's advantages over Marvin:** +- **Prompt transparency** - See and control exactly what's sent to the LLM +- **Multi-language support** - Python, TypeScript, Java, Go, not just Python +- **Model flexibility** - Use any provider (OpenAI, Claude, Gemini, open-source) +- **Real testing** - Test in VSCode without API calls or burning tokens +- **Production features** - Built-in retries, fallbacks, streaming, error handling +- **Better type system** - Enums with descriptions, aliases, complex nested types +- **Cost optimization** - See token usage and optimize prompts + +**What this means for your applications:** +- **Faster development** - Test and iterate on prompts instantly +- **Better reliability** - Handle edge cases and model failures automatically +- **Multi-language teams** - Same logic works in Python, TypeScript, and more +- **Production readiness** - Built-in observability and error handling +- **Model independence** - Never get locked into one provider + +**Marvin is great for:** Quick Python prototypes, simple one-off tasks +**BAML is great for:** Production applications, multi-language teams, complex workflows + +We recommend checking out Marvin if you're just starting with prompt engineering or need a quick Python solution. But if you're building production applications that need reliability, observability, and multi-language support, [try BAML](https://docs.boundaryml.com). ### Limitations of BAML diff --git a/fern/01-guide/09-comparisons/openai-sdk.mdx b/fern/01-guide/09-comparisons/openai-sdk.mdx new file mode 100644 index 0000000000..0d613d50ff --- /dev/null +++ b/fern/01-guide/09-comparisons/openai-sdk.mdx @@ -0,0 +1,426 @@ +--- +title: Comparing OpenAI SDK +--- + +[OpenAI SDK](https://github.com/openai/openai-python) now supports structured outputs natively, making it easier than ever to get typed responses from GPT models. + +Let's explore how this works in practice and where you might hit limitations. + +### Why working with LLMs requires more than just OpenAI SDK + +OpenAI's structured outputs look fantastic at first: + +```python +from pydantic import BaseModel +from openai import OpenAI + +class Resume(BaseModel): + name: str + skills: list[str] + +client = OpenAI() +completion = client.beta.chat.completions.parse( + model="gpt-4o", + messages=[ + {"role": "user", "content": "John Doe, Python, Rust"} + ], + response_format=Resume, +) +resume = completion.choices[0].message.parsed +``` + +Simple and type-safe! Let's add education to make it more realistic: + +```diff ++class Education(BaseModel): ++ school: str ++ degree: str ++ year: int + +class Resume(BaseModel): + name: str + skills: list[str] ++ education: list[Education] + +completion = client.beta.chat.completions.parse( + model="gpt-4o", + messages=[ + {"role": "user", "content": """John Doe +Python, Rust +University of California, Berkeley, B.S. 
in Computer Science, 2020"""} + ], + response_format=Resume, +) +``` + +Still works! But let's dig deeper... + +### The prompt mystery + +Your extraction works 90% of the time, but fails on certain resumes. You need to debug: + +```python +# What prompt is actually being sent? +completion = client.beta.chat.completions.parse( + model="gpt-4o", + messages=[{"role": "user", "content": resume_text}], + response_format=Resume, +) + +# You can't see: +# - How the schema is formatted +# - What instructions the model receives +# - Why certain fields are misunderstood +``` + +You start experimenting with system messages: + +```python +completion = client.beta.chat.completions.parse( + model="gpt-4o", + messages=[ + {"role": "system", "content": "Extract resume information accurately."}, + {"role": "user", "content": resume_text} + ], + response_format=Resume, +) + +# But what if you need more specific instructions? +# How do you tell it to handle edge cases? +``` + +### Classification without context + +Now you need to classify resumes by seniority: + +```python +from enum import Enum + +class SeniorityLevel(str, Enum): + JUNIOR = "junior" + MID = "mid" + SENIOR = "senior" + STAFF = "staff" + +class Resume(BaseModel): + name: str + skills: list[str] + education: list[Education] + seniority: SeniorityLevel +``` + +But the model doesn't know what these levels mean! You try adding a docstring: + +```python +class Resume(BaseModel): + """Resume with seniority classification. + + Seniority levels: + - junior: 0-2 years experience + - mid: 2-5 years experience + - senior: 5-10 years experience + - staff: 10+ years experience + """ + name: str + skills: list[str] + education: list[Education] + seniority: SeniorityLevel +``` + +But docstrings aren't sent to the model. So you resort to prompt engineering: + +```python +messages = [ + {"role": "system", "content": """Extract resume information. + +Classify seniority as: +- junior: 0-2 years experience +- mid: 2-5 years experience +- senior: 5-10 years experience +- staff: 10+ years experience"""}, + {"role": "user", "content": resume_text} +] +``` + +Now your business logic is split between types and prompts... + +### The vendor lock-in problem + +Your team wants to experiment with Claude for better reasoning: + +```python +# With OpenAI SDK, you're stuck with OpenAI +from openai import OpenAI +client = OpenAI() + +# Want to try Claude? Start over with a different SDK +from anthropic import Anthropic +anthropic_client = Anthropic() + +# Completely different API +message = anthropic_client.messages.create( + model="claude-3-opus-20240229", + messages=[{"role": "user", "content": resume_text}], + # No structured outputs support! +) + +# Now you need custom parsing +import json +resume_data = json.loads(message.content) +resume = Resume(**resume_data) # Hope it matches! +``` + +### Testing and token tracking + +You want to test your extraction and track costs: + +```python +# How do you test without burning tokens? +def test_resume_extraction(): + completion = client.beta.chat.completions.parse( + model="gpt-4o", + messages=[{"role": "user", "content": test_resume}], + response_format=Resume, + ) + # This costs money every time! + +# Mock the OpenAI client? +from unittest.mock import Mock +mock_client = Mock() +mock_client.beta.chat.completions.parse.return_value = ... +# You're not really testing the extraction logic + +# Track token usage? +completion = client.beta.chat.completions.parse(...) +print(completion.usage.total_tokens) # At least this exists! 
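+# The usage object does break the bill down further (standard fields on the
+# OpenAI Python SDK response):
+print(completion.usage.prompt_tokens)      # input: your messages plus the injected schema
+print(completion.usage.completion_tokens)  # output: the generated JSON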
+ +# But how many tokens does the schema formatting use? +# Could you optimize it? +``` + +### Production complexity creep + +As your app scales, you need: +- Retry logic for rate limits +- Fallback to GPT-3.5 when GPT-4 is down +- A/B testing different prompts +- Structured logging for debugging + +Your code evolves: + +```python +class ResumeExtractor: + def __init__(self): + self.client = OpenAI() + self.fallback_client = OpenAI() # Different API key? + + def extract_with_retries(self, text: str, max_retries: int = 3): + for attempt in range(max_retries): + try: + return self._extract(text, model="gpt-4o") + except RateLimitError: + if attempt == max_retries - 1: + # Try fallback model + return self._extract(text, model="gpt-3.5-turbo") + time.sleep(2 ** attempt) + + def _extract(self, text: str, model: str): + messages = self._build_messages(text) + + completion = self.client.beta.chat.completions.parse( + model=model, + messages=messages, + response_format=Resume, + ) + + self._log_usage(completion, model) + return completion.choices[0].message.parsed + + # ... more infrastructure code +``` + +The simple API is now buried in error handling and logging. + +## Enter BAML + +BAML was built for real-world LLM applications. Here's the same resume extraction: + +```baml +class Education { + school string + degree string + year int +} + +enum SeniorityLevel { + JUNIOR @description("0-2 years of experience") + MID @description("2-5 years of experience") + SENIOR @description("5-10 years of experience") + STAFF @description("10+ years of experience, technical leadership") +} + +class Resume { + name string + skills string[] + education Education[] + seniority SeniorityLevel +} + +function ExtractResume(resume_text: string) -> Resume { + client GPT4 + prompt #" + Extract structured information from this resume. + + When determining seniority, use these guidelines: + {{ ctx.output_format.seniority }} + + Resume: + --- + {{ resume_text }} + --- + + Output format: + {{ ctx.output_format }} + "# +} +``` + +See the difference? +1. **The prompt is explicit** - No guessing what's sent +2. **Enums have descriptions** - Built into the type system +3. **One place for everything** - Types and prompts together + +### Multi-model freedom + +```baml +// Define all your models +client GPT4 { + provider openai + options { + model "gpt-4o" + temperature 0.1 + } +} + +client GPT35 { + provider openai + options { + model "gpt-3.5-turbo" + temperature 0.1 + } +} + +client Claude { + provider anthropic + options { + model "claude-3-opus-20240229" + } +} + +client Llama { + provider ollama + options { + model "llama3" + } +} + +// Use ANY model with the SAME function +function ExtractResume(resume_text: string) -> Resume { + client GPT4 // Just change this line! + prompt #"..."# +} +``` + +In Python: +```python +from baml_client import baml as b + +# Default model +resume = await b.ExtractResume(resume_text) + +# Use different models for different scenarios +cheap_extraction = await b.ExtractResume(simple_text, {"client": "GPT35"}) +quality_extraction = await b.ExtractResume(complex_text, {"client": "Claude"}) +private_extraction = await b.ExtractResume(sensitive_text, {"client": "Llama"}) + +# Same interface, same types, different models! +``` + +### Testing without burning money + +With BAML's VSCode extension: + +BAML VSCode playground with instant testing + +1. **Write your test cases** - Visual interface for test data +2. **See the exact prompt** - No hidden abstractions +3. 
**Test instantly** without API calls +4. **Iterate until perfect** - Instant feedback loop +5. **Save test cases** for CI/CD + +Opening BAML playground from VSCode + +*No mocking, no token costs, real testing.* + +### Built for production + +```baml +// Retry configuration +client GPT4WithRetries { + provider openai + options { + model "gpt-4o" + temperature 0.1 + } + retry_policy { + max_retries 3 + strategy exponential_backoff + } +} + +// Fallback chains +client SmartRouter { + provider fallback + options { + clients ["GPT4", "Claude", "GPT35"] + } +} +``` + +All the production concerns handled declaratively. + +### The bottom line + +OpenAI's structured outputs are great if you: +- Only use OpenAI models +- Don't need prompt customization +- Have simple extraction needs + +**But production LLM applications need more:** + +**BAML's advantages over OpenAI SDK:** +- **Model flexibility** - Works with GPT, Claude, Gemini, Llama, and any future model +- **Prompt transparency** - See and optimize exactly what's sent to the LLM +- **Real testing** - Test in VSCode without burning tokens or API calls +- **Production features** - Built-in retries, fallbacks, and smart routing +- **Cost optimization** - Understand token usage and optimize prompts +- **Schema-Aligned Parsing** - Get structured outputs from any model, not just OpenAI +- **Streaming + Structure** - Stream structured data with loading bars + +**Why this matters:** +- **Future-proof** - Never get locked into one model provider +- **Faster development** - Instant testing and iteration in your editor +- **Better reliability** - Built-in error handling and fallback strategies +- **Team productivity** - Prompts are versioned, testable code +- **Cost control** - Optimize token usage across different models + +With BAML, you get all the benefits of OpenAI's structured outputs plus the flexibility and control needed for production applications. + +### Limitations of BAML + +BAML has some limitations: +1. It's a new language (though easy to learn) +2. Best experience needs VSCode +3. Focused on structured extraction + +If you're building a simple OpenAI-only prototype, the OpenAI SDK is fine. If you're building production LLM features that need to scale, [try BAML](https://docs.boundaryml.com). \ No newline at end of file diff --git a/fern/01-guide/09-comparisons/pydantic.mdx b/fern/01-guide/09-comparisons/pydantic.mdx index 1218cc462e..8501651983 100644 --- a/fern/01-guide/09-comparisons/pydantic.mdx +++ b/fern/01-guide/09-comparisons/pydantic.mdx @@ -400,10 +400,37 @@ In this image we change the types and BAML automatically updates the prompt, par Adding retries or resilience requires just [a couple of modifications](/ref/llm-client-strategies/retry-policy). And best of all, **you can test things instantly, without leaving your VSCode**. +### The bottom line + +Pydantic is excellent for data validation, but LLM applications need more than validation - they need a complete structured extraction solution. 
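+
+For a sense of what "more than validation" means, here is a minimal sketch of the glue code a bare Pydantic model typically leaves you to write (the `call_llm` helper is a placeholder for whatever SDK call you use):
+
+```python
+import json
+import time
+
+from pydantic import BaseModel, ValidationError
+
+class Resume(BaseModel):
+    name: str
+    skills: list[str]
+
+def extract_resume(call_llm, text: str, retries: int = 3) -> Resume:
+    """Call the model (via the caller-supplied `call_llm` placeholder),
+    parse the JSON, validate it, and retry on failure."""
+    for attempt in range(retries):
+        raw = call_llm(f"Return JSON with `name` and `skills` for: {text}")
+        try:
+            return Resume(**json.loads(raw))  # parse, then validate against the schema
+        except (json.JSONDecodeError, ValidationError):
+            time.sleep(2 ** attempt)  # back off and retry with the same prompt
+    raise RuntimeError("extraction failed after retries")
+```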
+ +**BAML's advantages over Pydantic:** +- **No boilerplate** - BAML generates all parsing, retry, and error handling code +- **Visual development** - See prompts and test instantly in VSCode +- **Better prompts** - Optimized schema format uses 80% fewer tokens +- **Schema-Aligned Parsing** - Handles malformed JSON and edge cases automatically +- **Multi-model support** - Works with any LLM provider, not just OpenAI +- **Type safety across languages** - Generated clients for Python, TypeScript, Java, Go +- **Built-in resilience** - Retries, fallbacks, and smart error recovery + +**What you get with BAML that Pydantic can't provide:** +- **Instant testing** - No API calls or token costs during development +- **Prompt optimization** - See exactly what's sent and optimize token usage +- **Production features** - Automatic retries, model fallbacks, streaming support +- **Better debugging** - Know exactly why extraction failed +- **Future-proof** - Never get locked into one model or provider + +**Why this matters for your team:** +- **10x faster iteration** - Test prompts instantly without running Python code +- **Better reliability** - Handle edge cases and malformed outputs automatically +- **Cost optimization** - Reduce token usage with optimized schema formats +- **Model flexibility** - Switch between GPT, Claude, open-source models seamlessly + +We built BAML because writing a Python library wasn't powerful enough to solve the real challenges of LLM structured extraction. + ### Conclusion -We built BAML because writing a Python library was just not powerful enough to do everything we envisioned, as we have just explored. -Get started today with [python](/guide/installation-language/python). +Get started today with [Python](/guide/installation-language/python), [TypeScript](/guide/installation-language/typescript), or [other languages](/guide/installation-language/rest). -Our mission is to make the best DX for AI engineers working with LLMs. Contact us at founders@boundaryml.com or [Join us on Discord](https://discord.gg/BTNBeXGuaS) to stay in touch with the community and influence the roadmap. +Our mission is to make the best developer experience for AI engineers working with LLMs. Contact us at founders@boundaryml.com or [Join us on Discord](https://discord.gg/BTNBeXGuaS) to stay in touch with the community and influence the roadmap. diff --git a/fern/01-guide/why-baml.mdx b/fern/01-guide/why-baml.mdx new file mode 100644 index 0000000000..718ac08c3e --- /dev/null +++ b/fern/01-guide/why-baml.mdx @@ -0,0 +1,556 @@ +--- +title: Why BAML? +description: The journey from simple LLM calls to production-ready structured extraction +--- + +Let's say you want to extract structured data from resumes. It starts simple enough... + +But first, let's see where we're going with this story: + + + +*BAML: What it is and how it helps - see the full developer experience* + +## It starts simple + +You begin with a basic LLM call to extract a name and skills: + +```python +import openai + +def extract_resume(text): + response = openai.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": f"Extract name and skills from: {text}"}] + ) + return response.choices[0].message.content +``` + +This works... sometimes. But you need structured data, not free text. 
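+
+A typical run makes the problem concrete - the reply is prose, so there is nothing type-safe to hang onto (output shown is illustrative):
+
+```python
+result = extract_resume("John Doe, Python, Rust")
+print(result)
+# e.g. "Sure! The candidate's name is John Doe and their skills include Python and Rust."
+# Good luck turning that into `result.name` or `result.skills` reliably.
+```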
+ +## You need structure + +So you try JSON mode and add Pydantic for validation: + +```python +from pydantic import BaseModel +import json + +class Resume(BaseModel): + name: str + skills: list[str] + +def extract_resume(text): + prompt = f"""Extract resume data as JSON: +{text} + +Return JSON with fields: name (string), skills (array of strings)""" + + response = openai.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": prompt}], + response_format={"type": "json_object"} + ) + + data = json.loads(response.choices[0].message.content) + return Resume(**data) +``` + +Better! But now you need more fields. You add education, experience, and location: + +```python +class Education(BaseModel): + school: str + degree: str + year: int + +class Resume(BaseModel): + name: str + skills: list[str] + education: list[Education] + location: str + years_experience: int +``` + +The prompt gets longer and more complex. But wait - how do you test this without burning tokens? + +## Testing becomes expensive + +Every test costs money and takes time: + +```python +# This burns tokens every time you run tests! +def test_resume_extraction(): + test_resume = "John Doe, Python expert, MIT 2020..." + result = extract_resume(test_resume) # API call = $$$ + assert result.name == "John Doe" +``` + +You try mocking, but then you're not testing your actual extraction logic. Your prompt could be completely broken and tests would still pass. + +## Error handling nightmare + +Real resumes break your extraction. The LLM returns malformed JSON: + +Resume extraction error in traditional approach + +```json +{ + "name": "John Doe", + "skills": ["Python", "JavaScript" + // Missing closing bracket! +``` + +You add retry logic, JSON fixing, error handling: + +```python +import re +import time + +def extract_resume(text, max_retries=3): + for attempt in range(max_retries): + try: + response = openai.chat.completions.create(...) + content = response.choices[0].message.content + + # Try to fix common JSON issues + content = fix_json(content) + + data = json.loads(content) + return Resume(**data) + except (json.JSONDecodeError, ValidationError) as e: + if attempt == max_retries - 1: + raise + time.sleep(2 ** attempt) # Exponential backoff + +def fix_json(content): + # Remove text before/after JSON + json_match = re.search(r'\{.*\}', content, re.DOTALL) + if json_match: + content = json_match.group(0) + + # Fix common issues + content = content.replace(',}', '}') + content = content.replace(',]', ']') + # ... more fixes + + return content +``` + +Your simple extraction function is now 50+ lines of infrastructure code. + +## Multi-model chaos + +Your company wants to use Claude for some tasks (better reasoning) and GPT-4-mini for others (cost savings): + +```python +def extract_resume(text, provider="openai", model="gpt-4o"): + if provider == "openai": + import openai + client = openai.OpenAI() + response = client.chat.completions.create(model=model, ...) + elif provider == "anthropic": + import anthropic + client = anthropic.Anthropic() + # Different API! Need to rewrite everything + response = client.messages.create(model=model, ...) + # ... handle different response formats +``` + +Each provider has different APIs, different response formats, different capabilities. Your code becomes a mess of if/else statements. + +## The prompt mystery + +Your extraction fails on certain resumes. You need to debug, but what was actually sent to the LLM? + +```python +# What prompt was generated? 
How many tokens did it use? +# Why did this specific resume fail? +# How do I optimize for cost? + +# You can't easily see: +# - The exact prompt that was sent +# - How the schema was formatted +# - Token usage breakdown +# - Why specific fields were missed +``` + +You start adding logging, token counting, prompt inspection tools... + +## Classification gets complex + +Now you need to classify seniority levels: + +```python +from enum import Enum + +class SeniorityLevel(str, Enum): + JUNIOR = "junior" + MID = "mid" + SENIOR = "senior" + STAFF = "staff" + +class Resume(BaseModel): + name: str + skills: list[str] + education: list[Education] + seniority: SeniorityLevel +``` + +But the LLM doesn't know what these levels mean! You update the prompt: + +```python +prompt = f"""Extract resume data as JSON: + +Seniority levels: +- junior: 0-2 years experience +- mid: 2-5 years experience +- senior: 5-10 years experience +- staff: 10+ years experience + +{text} + +Return JSON with fields: name, skills, education, seniority...""" +``` + +Your prompt is getting huge and your business logic is scattered between code and strings. + +## Production deployment headaches + +In production, you need: +- Retry policies for rate limits +- Fallback models when primary is down +- Cost tracking and optimization +- Error monitoring and alerting +- A/B testing different prompts + +Your simple extraction function becomes a complex service: + +```python +class ResumeExtractor: + def __init__(self): + self.primary_client = openai.OpenAI() + self.fallback_client = anthropic.Anthropic() + self.token_tracker = TokenTracker() + self.error_monitor = ErrorMonitor() + + async def extract_with_fallback(self, text): + try: + return await self._extract_openai(text) + except RateLimitError: + return await self._extract_anthropic(text) + except Exception as e: + self.error_monitor.log(e) + raise + + def _extract_openai(self, text): + # 50+ lines of OpenAI-specific logic + pass + + def _extract_anthropic(self, text): + # 50+ lines of Anthropic-specific logic + pass +``` + +## Enter BAML + +What if you could go back to something simple, but keep all the power? + +```baml +class Education { + school string + degree string + year int +} + +enum SeniorityLevel { + JUNIOR @description("0-2 years of experience") + MID @description("2-5 years of experience") + SENIOR @description("5-10 years of experience") + STAFF @description("10+ years of experience, technical leadership") +} + +class Resume { + name string + skills string[] + education Education[] + seniority SeniorityLevel +} + +function ExtractResume(resume_text: string) -> Resume { + client GPT4 + prompt #" + Extract information from this resume. + + For seniority level, consider: + {{ ctx.output_format.seniority }} + + Resume: + --- + {{ resume_text }} + --- + + {{ ctx.output_format }} + "# +} +``` + +Look what you get immediately: + +BAML playground working with resume extraction + +*BAML playground showing successful resume extraction with clear prompts and structured output* + +### 1. **Instant Testing** +Test in VSCode playground without API calls or token costs: + +VSCode playground showing resume extraction with prompt preview + +- **See the exact prompt** that will be sent to the LLM +- **Test with real data instantly** - no API calls needed +- **Save test cases** for regression testing +- **Visual prompt preview** shows token usage and formatting + +VSCode test cases interface + +*Build up a library of test cases that run instantly* + +### 2. 
**Multi-Model Made Simple** +```baml +client GPT4 { + provider openai + options { model "gpt-4o" } +} + +client Claude { + provider anthropic + options { model "claude-3-opus-20240229" } +} + +client GPT4Mini { + provider openai + options { model "gpt-4o-mini" } +} + +// Same function, any model - just change the client +function ExtractResume(resume_text: string) -> Resume { + client GPT4 // Switch to Claude or GPT4Mini with one line + prompt #"..."# +} +``` + +### 3. **Schema-Aligned Parsing (SAP)** +BAML's breakthrough innovation follows Postel's Law: *"Be conservative in what you do, be liberal in what you accept from others."* + +Instead of rejecting imperfect outputs, SAP actively transforms them to match your schema using custom edit distance algorithms. + + + + +**SAP vs Other Approaches:** + +| Model | Function Calling | Python AST Parser | **SAP** | +|-------|------------------|-------------------|---------| +| gpt-3.5-turbo | 87.5% | 75.8% | **92%** | +| gpt-4o | 87.4% | 82.1% | **93%** | +| claude-3-haiku | 57.3% | 82.6% | **91.7%** | + +**Key insight:** SAP + GPT-3.5 turbo beats GPT-4o + structured outputs, saving you money while improving accuracy. + + + + + +**What SAP fixes automatically:** + +*Raw LLM Output:* +```json +// The model often outputs this mess: +{ + "name": John Doe, // Missing quotes + "skills": ["Python", "JavaScript",], // Trailing comma + "experience": 3.5 years, // Invalid type + "bio": "I'm a \"developer\"", // Unescaped quotes + /* some comment */ // JSON comments + "confidence": 9/10 // Fraction instead of decimal +} +``` + +*SAP Transforms to:* +```json +{ + "name": "John Doe", + "skills": ["Python", "JavaScript"], + "experience": 3.5, + "bio": "I'm a \"developer\"", + "confidence": 0.9 +} +``` + +**Error correction techniques:** +- Adds missing quotes around strings +- Removes trailing commas +- Strips comments and "yapping" +- Converts fractions to decimals +- Escapes special characters +- Handles incomplete JSON sequences + + + + + +**Traditional JSON Schema (verbose):** +```json +{ + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The person's full name" + }, + "skills": { + "type": "array", + "items": {"type": "string"}, + "description": "List of technical skills" + }, + "experience": { + "type": "number", + "description": "Years of experience" + } + }, + "required": ["name", "skills"] +} +``` +*Token count: ~180 tokens* + +**BAML Schema (optimized):** +```baml +class Resume { + name string @description("The person's full name") + skills string[] @description("List of technical skills") + experience float? @description("Years of experience") +} +``` +*Token count: ~35 tokens* + +**80% token reduction** while being clearer to the model! + + + + + +**Traditional approach** - Choose reasoning OR structure: +```python +# Either get reasoning (unstructured) +reasoning = llm.complete("Analyze this resume and explain your thinking...") + +# OR get structure (no reasoning) +resume = llm.structured_output(resume_schema, text) +``` + +**BAML's SAP** - Get both in one call: +```baml +class ResumeAnalysis { + reasoning string @description("Step-by-step analysis") + name string + skills string[] + seniority_level SeniorityLevel + confidence_score float +} + +function AnalyzeResume(text: string) -> ResumeAnalysis { + client GPT4 + prompt #" + Analyze this resume step by step, then extract structured data. 
+ + Resume: {{ text }} + + {{ ctx.output_format }} + "# +} +``` + +**Result:** Chain-of-thought reasoning AND structured output in a single API call. + + + + +### 4. **Production Features Built-In** +```baml +client RobustGPT4 { + provider openai + options { model "gpt-4o" } + retry_policy { + max_retries 3 + strategy exponential_backoff + } +} + +client SmartFallback { + provider fallback + options { + clients ["GPT4", "Claude", "GPT4Mini"] + } +} +``` + +### 5. **Token Optimization** +- See exact token usage for every call +- BAML's schema format uses 80% fewer tokens than JSON Schema +- Optimize prompts with instant feedback + +### 6. **Type Safety Everywhere** + +Generated BAML client with type safety + +```python +from baml_client import baml as b + +# Fully typed, works in Python, TypeScript, Java, Go +resume = await b.ExtractResume(resume_text) +print(resume.seniority) # Type: SeniorityLevel +``` + +*BAML generates fully typed clients for all languages automatically* + +**See how changes instantly update the prompt:** + +BAML prompt view updating in real-time as types change + +*Change your types → Prompt automatically updates → See the difference immediately* + +### 7. **Advanced Streaming with UI Integration** +BAML's semantic streaming lets you build real UIs with loading bars and type-safe implementations: + +```baml +class BlogPost { + title string @stream.done @stream.not_null + content string @stream.with_state +} +``` + +**What this enables:** +- **Loading bars** - Show progress as structured data streams in +- **Semantic guarantees** - Title only appears when complete, content streams token by token +- **Type-safe streaming** - Full TypeScript/Python types for partial data +- **UI state management** - Know exactly what's loading vs complete + + + +*See semantic streaming in action - structured data streaming with loading states* + +## The Bottom Line + +**You started with:** A simple LLM call +**You ended up with:** Hundreds of lines of infrastructure code + +**With BAML, you get:** +- The simplicity of your first attempt +- All the production features you built manually +- Better reliability than you could build yourself +- 10x faster development iteration +- Full control and transparency + +BAML is what LLM development should have been from the start. Ready to see the difference? [Get started with BAML](/guide/installation-language/python). \ No newline at end of file diff --git a/fern/docs.yml b/fern/docs.yml index 4a13a09e5e..2784e618db 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -285,6 +285,9 @@ navigation: - page: What is BAML? icon: fa-regular fa-question-circle path: 01-guide/what-are-function-definitions.mdx + - page: Why BAML? + icon: fa-regular fa-lightbulb + path: 01-guide/why-baml.mdx - page: What's the baml_src folder icon: fa-regular fa-folder path: 01-guide/what-is-baml_src.mdx @@ -421,9 +424,18 @@ navigation: path: 01-guide/07-observability/studio.mdx - section: Comparisons contents: + - page: BAML vs Langchain + icon: fa-solid fa-magnifying-glass + path: 01-guide/09-comparisons/langchain.mdx - page: BAML vs Marvin icon: fa-solid fa-magnifying-glass path: 01-guide/09-comparisons/marvin.mdx + - page: BAML vs Ai-SDK + icon: fa-solid fa-magnifying-glass + path: 01-guide/09-comparisons/ai-sdk.mdx + - page: BAML vs OpenAI SDK + icon: fa-solid fa-magnifying-glass + path: 01-guide/09-comparisons/openai-sdk.mdx - page: BAML vs Pydantic icon: fa-solid fa-magnifying-glass path: 01-guide/09-comparisons/pydantic.mdx