Skip to content

Commit

Permalink
Evals for TextExtract (#303)
Browse files Browse the repository at this point in the history
* move stockx to experimental

* accept param for extraction method

* add textextract specific evals

* accept textextract param

* fix instruction and accept textextract param

* update extract system prompt. add additional prompt if using textextract

* parse args before setting extract method

* fix errors from merge

* prettier

* don't use any

* raise notimplementederror for textExtract

* rename textextract to text_extract

* add text_extract evals to CI

* prettier

* add useTextExtract to ExtractOptions

* run both extract methods in CI

* prettier

* continue on error until textExtract is implemented

* fix yml

* rm extra space

* fix indentation

* prettier

* increase time limit
  • Loading branch information
seanmcguire12 authored Dec 15, 2024
1 parent cdaf3ea commit d2b591d
Show file tree
Hide file tree
Showing 40 changed files with 1,361 additions and 86 deletions.
106 changes: 87 additions & 19 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
env:
EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
EXPERIMENTAL_EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest,o1-mini,o1-preview"
EVAL_CATEGORIES: "observe,act,combination,extract,experimental"
EVAL_CATEGORIES: "observe,act,combination,extract,experimental,text_extract"

concurrency:
group: ${{ github.ref }}
Expand Down Expand Up @@ -67,7 +67,6 @@ jobs:
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
HEADLESS: true
EVAL_ENV: browserbase

steps:
- name: Check out repository code
uses: actions/checkout@v4
Expand All @@ -83,29 +82,98 @@ jobs:
- name: Install Playwright browsers
run: npm exec playwright install --with-deps

- name: Run Extract Evals
run: npm run evals category extract
# Run extract category with domExtract
- name: Run Extract Evals (domExtract)
run: npm run evals category extract -- --extract-method=domExtract
- name: Save Extract Dom Results
run: mv eval-summary.json eval-summary-extract-dom.json

# Run extract category with textExtract
- name: Run Extract Evals (textExtract)
run: npm run evals category extract -- --extract-method=textExtract
continue-on-error: true
# - name: Save Extract Text Results
# run: mv eval-summary.json eval-summary-extract-text.json

- name: Log Extract Evals Performance
- name: Log and Compare Extract Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
extract_score=$(jq '.categories.extract' eval-summary.json)
echo "Extract category score: $extract_score%"
if (( $(echo "$extract_score < 80" | bc -l) )); then
echo "Extract category score is below 80%. Failing CI."
exit 1
fi
else
echo "Eval summary not found for extract category. Failing CI."
experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
echo "DomExtract Extract category score: $dom_score%"
echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
# experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json)
# text_score=$(jq '.categories.extract' eval-summary-extract-text.json)
# echo "TextExtract Extract category score: $text_score%"
# echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"
if (( $(echo "$dom_score < 80" | bc -l) )); then
echo "DomExtract extract category score is below 80%. Failing CI."
exit 1
fi
run-text-extract-evals:
needs: [run-extract-evals]
runs-on: ubuntu-latest
timeout-minutes: 40
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
HEADLESS: true
EVAL_ENV: browserbase
steps:
- name: Check out repository code
uses: actions/checkout@v4

- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: "20"

- name: Install dependencies
run: npm install --no-frozen-lockfile

- name: Install Playwright browsers
run: npm exec playwright install --with-deps

# Run text_extract category with domExtract
- name: Run text_extract Evals (domExtract)
run: npm run evals category text_extract -- --extract-method=domExtract
- name: Save text_extract Dom Results
run: mv eval-summary.json eval-summary-text_extract-dom.json

# Run text_extract category with textExtract
- name: Run text_extract Evals (textExtract)
run: npm run evals category text_extract -- --extract-method=textExtract
continue-on-error: true
# - name: Save text_extract Text Results
# run: mv eval-summary.json eval-summary-text_extract-text.json

- name: Log and Compare text_extract Evals Performance
run: |
experimentNameDom=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
dom_score=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
echo "DomExtract text_extract category score: $dom_score%"
echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
# experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
# text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
# echo "TextExtract text_extract category score: $text_score%"
# echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"

# Fail CI only if textExtract is below 80%
# if (( $(echo "$text_score < 80" | bc -l) )); then
# echo "textExtract text_extract category score is below 80%. Failing CI."
# exit 1
# fi

run-act-evals:
runs-on: ubuntu-latest
timeout-minutes: 25
needs: [run-extract-evals]
needs: [run-text-extract-evals]
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
Expand Down Expand Up @@ -198,7 +266,7 @@ jobs:
run-combination-evals:
runs-on: ubuntu-latest
timeout-minutes: 25
timeout-minutes: 40
needs: [run-observe-evals]
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Expand Down Expand Up @@ -246,7 +314,7 @@ jobs:
run-experimental-evals:
runs-on: ubuntu-latest
timeout-minutes: 120
needs: [run-combination-evals]
needs: [run-text-extract-evals]
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Expand Down
7 changes: 6 additions & 1 deletion evals/combination/allrecipes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { initStagehand } from "../utils";
import { EvalFunction } from "../../types/evals";
import { z } from "zod";

export const allrecipes: EvalFunction = async ({ modelName, logger }) => {
export const allrecipes: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
logger,
Expand All @@ -28,6 +32,7 @@ export const allrecipes: EvalFunction = async ({ modelName, logger }) => {
.describe("Total number of ratings for the recipe"),
}),
modelName,
useTextExtract,
});

await stagehand.close();
Expand Down
12 changes: 9 additions & 3 deletions evals/combination/arxiv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals";
import { initStagehand } from "../utils";
import { z } from "zod";

export const arxiv: EvalFunction = async ({ modelName, logger }) => {
export const arxiv: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
logger,
Expand All @@ -29,7 +33,8 @@ export const arxiv: EvalFunction = async ({ modelName, logger }) => {
)
.describe("list of papers"),
}),
modelName: "gpt-4o-2024-08-06",
modelName,
useTextExtract,
});

if (
Expand Down Expand Up @@ -86,7 +91,8 @@ export const arxiv: EvalFunction = async ({ modelName, logger }) => {
)
.nullable(),
}),
modelName: "gpt-4o-2024-08-06",
modelName,
useTextExtract,
});

papers.push({
Expand Down
2 changes: 2 additions & 0 deletions evals/combination/extract_collaborators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { z } from "zod";
export const extract_collaborators: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
Expand Down Expand Up @@ -32,6 +33,7 @@ export const extract_collaborators: EvalFunction = async ({
),
}),
modelName,
useTextExtract,
});

await stagehand.close();
Expand Down
2 changes: 2 additions & 0 deletions evals/combination/extract_github_commits.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { z } from "zod";
export const extract_github_commits: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
Expand Down Expand Up @@ -32,6 +33,7 @@ export const extract_github_commits: EvalFunction = async ({
),
}),
modelName,
useTextExtract,
});

logger.log({
Expand Down
2 changes: 2 additions & 0 deletions evals/combination/imdb_movie_details.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { z } from "zod";
export const imdb_movie_details: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
Expand All @@ -28,6 +29,7 @@ export const imdb_movie_details: EvalFunction = async ({
.describe("List of countries with the most ratings"),
}),
modelName,
useTextExtract,
});

await stagehand.close();
Expand Down
9 changes: 7 additions & 2 deletions evals/combination/peeler_complex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals";
import { initStagehand } from "../utils";
import { z } from "zod";

export const peeler_complex: EvalFunction = async ({ modelName, logger }) => {
export const peeler_complex: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
logger,
Expand All @@ -27,7 +31,8 @@ export const peeler_complex: EvalFunction = async ({ modelName, logger }) => {
const { price } = await stagehand.extract({
instruction: "get the price of the peeler",
schema: z.object({ price: z.number().nullable() }),
modelName: "gpt-4o-2024-08-06",
modelName,
useTextExtract,
});

await stagehand.close();
Expand Down
7 changes: 6 additions & 1 deletion evals/combination/sciquest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { initStagehand } from "../utils";
import { EvalFunction } from "../../types/evals";
import { z } from "zod";

export const sciquest: EvalFunction = async ({ modelName, logger }) => {
export const sciquest: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
logger,
Expand All @@ -25,6 +29,7 @@ export const sciquest: EvalFunction = async ({ modelName, logger }) => {
total_results: z.string(),
}),
modelName,
useTextExtract,
});

await stagehand.close();
Expand Down
3 changes: 3 additions & 0 deletions evals/experimental/combination_sauce.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { z } from "zod";
export const combination_sauce: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
Expand All @@ -22,6 +23,8 @@ export const combination_sauce: EvalFunction = async ({
usernames: z.array(z.string()).describe("the accepted usernames"),
password: z.string().describe("the password for login"),
}),
modelName,
useTextExtract,
});

await stagehand.act({
Expand Down
9 changes: 7 additions & 2 deletions evals/experimental/costar.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { initStagehand } from "../utils";
import { EvalFunction } from "../../types/evals";
import { z } from "zod";

export const costar: EvalFunction = async ({ modelName, logger }) => {
export const costar: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
logger,
Expand All @@ -29,7 +33,8 @@ export const costar: EvalFunction = async ({ modelName, logger }) => {
schema: z.object({
title: z.string().describe("the title of the article").nullable(),
}),
modelName: "gpt-4o-2024-08-06",
modelName,
useTextExtract,
});

logger.log({
Expand Down
3 changes: 3 additions & 0 deletions evals/experimental/extract_aigrant_companies.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { EvalFunction } from "../../types/evals";
export const extract_aigrant_companies: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
Expand All @@ -26,6 +27,8 @@ export const extract_aigrant_companies: EvalFunction = async ({
}),
),
}),
modelName,
useTextExtract,
});

await stagehand.close();
Expand Down
2 changes: 2 additions & 0 deletions evals/experimental/extract_capacitor_info.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { z } from "zod";
export const extract_capacitor_info: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
Expand All @@ -27,6 +28,7 @@ export const extract_capacitor_info: EvalFunction = async ({
min_operating_temp: z.string(),
}),
modelName,
useTextExtract,
});

await stagehand.close();
Expand Down
8 changes: 7 additions & 1 deletion evals/experimental/extract_partners.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals";
import { initStagehand } from "../utils";
import { z } from "zod";

export const extract_partners: EvalFunction = async ({ modelName, logger }) => {
export const extract_partners: EvalFunction = async ({
modelName,
logger,
useTextExtract,
}) => {
const { stagehand, initResponse } = await initStagehand({
modelName,
logger,
Expand Down Expand Up @@ -40,6 +44,8 @@ export const extract_partners: EvalFunction = async ({ modelName, logger }) => {
.optional()
.describe("Any explanation about partner listing or absence thereof"),
}),
modelName,
useTextExtract,
});

const expectedPartners = [
Expand Down
Loading

0 comments on commit d2b591d

Please sign in to comment.