diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8d87d396..24035e70 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ on: env: EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest" EXPERIMENTAL_EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest,o1-mini,o1-preview" - EVAL_CATEGORIES: "observe,act,combination,extract,experimental" + EVAL_CATEGORIES: "observe,act,combination,extract,experimental,text_extract" concurrency: group: ${{ github.ref }} @@ -67,7 +67,6 @@ jobs: BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase - steps: - name: Check out repository code uses: actions/checkout@v4 @@ -83,29 +82,98 @@ jobs: - name: Install Playwright browsers run: npm exec playwright install --with-deps - - name: Run Extract Evals - run: npm run evals category extract + # Run extract category with domExtract + - name: Run Extract Evals (domExtract) + run: npm run evals category extract -- --extract-method=domExtract + - name: Save Extract Dom Results + run: mv eval-summary.json eval-summary-extract-dom.json + + # Run extract category with textExtract + - name: Run Extract Evals (textExtract) + run: npm run evals category extract -- --extract-method=textExtract + continue-on-error: true + # - name: Save Extract Text Results + # run: mv eval-summary.json eval-summary-extract-text.json - - name: Log Extract Evals Performance + - name: Log and Compare Extract Evals Performance run: | - experimentName=$(jq -r '.experimentName' eval-summary.json) - echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" - if [ -f eval-summary.json ]; then - extract_score=$(jq '.categories.extract' eval-summary.json) - echo "Extract category score: $extract_score%" - if (( $(echo "$extract_score < 80" | bc -l) )); then - echo "Extract category score is below 80%. Failing CI." - exit 1 - fi - else - echo "Eval summary not found for extract category. Failing CI." + experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json) + dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json) + echo "DomExtract Extract category score: $dom_score%" + echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}" + + # experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json) + # text_score=$(jq '.categories.extract' eval-summary-extract-text.json) + # echo "TextExtract Extract category score: $text_score%" + # echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}" + + if (( $(echo "$dom_score < 80" | bc -l) )); then + echo "DomExtract extract category score is below 80%. Failing CI." exit 1 fi + run-text-extract-evals: + needs: [run-extract-evals] + runs-on: ubuntu-latest + timeout-minutes: 40 + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} + BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} + HEADLESS: true + EVAL_ENV: browserbase + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install dependencies + run: npm install --no-frozen-lockfile + + - name: Install Playwright browsers + run: npm exec playwright install --with-deps + + # Run text_extract category with domExtract + - name: Run text_extract Evals (domExtract) + run: npm run evals category text_extract -- --extract-method=domExtract + - name: Save text_extract Dom Results + run: mv eval-summary.json eval-summary-text_extract-dom.json + + # Run text_extract category with textExtract + - name: Run text_extract Evals (textExtract) + run: npm run evals category text_extract -- --extract-method=textExtract + continue-on-error: true + # - name: Save text_extract Text Results + # run: mv eval-summary.json eval-summary-text_extract-text.json + + - name: Log and Compare text_extract Evals Performance + run: | + experimentNameDom=$(jq -r '.experimentName' eval-summary-text_extract-dom.json) + dom_score=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json) + echo "DomExtract text_extract category score: $dom_score%" + echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}" + + # experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json) + # text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json) + # echo "TextExtract text_extract category score: $text_score%" + # echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}" + + # Fail CI only if textExtract is below 80% + # if (( $(echo "$text_score < 80" | bc -l) )); then + # echo "textExtract text_extract category score is below 80%. Failing CI." + # exit 1 + # fi + run-act-evals: runs-on: ubuntu-latest timeout-minutes: 25 - needs: [run-extract-evals] + needs: [run-text-extract-evals] env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} @@ -198,7 +266,7 @@ jobs: run-combination-evals: runs-on: ubuntu-latest - timeout-minutes: 25 + timeout-minutes: 40 needs: [run-observe-evals] env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -246,7 +314,7 @@ jobs: run-experimental-evals: runs-on: ubuntu-latest timeout-minutes: 120 - needs: [run-combination-evals] + needs: [run-text-extract-evals] if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev' env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/evals/combination/allrecipes.ts b/evals/combination/allrecipes.ts index a8c6ec82..7756d087 100644 --- a/evals/combination/allrecipes.ts +++ b/evals/combination/allrecipes.ts @@ -2,7 +2,11 @@ import { initStagehand } from "../utils"; import { EvalFunction } from "../../types/evals"; import { z } from "zod"; -export const allrecipes: EvalFunction = async ({ modelName, logger }) => { +export const allrecipes: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -28,6 +32,7 @@ export const allrecipes: EvalFunction = async ({ modelName, logger }) => { .describe("Total number of ratings for the recipe"), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/combination/arxiv.ts b/evals/combination/arxiv.ts index 38cd00c1..c02bcfc9 100644 --- a/evals/combination/arxiv.ts +++ b/evals/combination/arxiv.ts @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals"; import { initStagehand } from "../utils"; import { z } from "zod"; -export const arxiv: EvalFunction = async ({ modelName, logger }) => { +export const arxiv: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -29,7 +33,8 @@ export const arxiv: EvalFunction = async ({ modelName, logger }) => { ) .describe("list of papers"), }), - modelName: "gpt-4o-2024-08-06", + modelName, + useTextExtract, }); if ( @@ -86,7 +91,8 @@ export const arxiv: EvalFunction = async ({ modelName, logger }) => { ) .nullable(), }), - modelName: "gpt-4o-2024-08-06", + modelName, + useTextExtract, }); papers.push({ diff --git a/evals/combination/extract_collaborators.ts b/evals/combination/extract_collaborators.ts index 2fb689ea..28b3e563 100644 --- a/evals/combination/extract_collaborators.ts +++ b/evals/combination/extract_collaborators.ts @@ -5,6 +5,7 @@ import { z } from "zod"; export const extract_collaborators: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -32,6 +33,7 @@ export const extract_collaborators: EvalFunction = async ({ ), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/combination/extract_github_commits.ts b/evals/combination/extract_github_commits.ts index 48ff8cb0..10aebcae 100644 --- a/evals/combination/extract_github_commits.ts +++ b/evals/combination/extract_github_commits.ts @@ -5,6 +5,7 @@ import { z } from "zod"; export const extract_github_commits: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -32,6 +33,7 @@ export const extract_github_commits: EvalFunction = async ({ ), }), modelName, + useTextExtract, }); logger.log({ diff --git a/evals/combination/imdb_movie_details.ts b/evals/combination/imdb_movie_details.ts index d62e6a41..136ccaab 100644 --- a/evals/combination/imdb_movie_details.ts +++ b/evals/combination/imdb_movie_details.ts @@ -5,6 +5,7 @@ import { z } from "zod"; export const imdb_movie_details: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -28,6 +29,7 @@ export const imdb_movie_details: EvalFunction = async ({ .describe("List of countries with the most ratings"), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/combination/peeler_complex.ts b/evals/combination/peeler_complex.ts index 79583f4c..e3a274c1 100644 --- a/evals/combination/peeler_complex.ts +++ b/evals/combination/peeler_complex.ts @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals"; import { initStagehand } from "../utils"; import { z } from "zod"; -export const peeler_complex: EvalFunction = async ({ modelName, logger }) => { +export const peeler_complex: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -27,7 +31,8 @@ export const peeler_complex: EvalFunction = async ({ modelName, logger }) => { const { price } = await stagehand.extract({ instruction: "get the price of the peeler", schema: z.object({ price: z.number().nullable() }), - modelName: "gpt-4o-2024-08-06", + modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/combination/sciquest.ts b/evals/combination/sciquest.ts index de6d600f..598bee8e 100644 --- a/evals/combination/sciquest.ts +++ b/evals/combination/sciquest.ts @@ -2,7 +2,11 @@ import { initStagehand } from "../utils"; import { EvalFunction } from "../../types/evals"; import { z } from "zod"; -export const sciquest: EvalFunction = async ({ modelName, logger }) => { +export const sciquest: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -25,6 +29,7 @@ export const sciquest: EvalFunction = async ({ modelName, logger }) => { total_results: z.string(), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/experimental/combination_sauce.ts b/evals/experimental/combination_sauce.ts index 55c89908..3f183137 100644 --- a/evals/experimental/combination_sauce.ts +++ b/evals/experimental/combination_sauce.ts @@ -5,6 +5,7 @@ import { z } from "zod"; export const combination_sauce: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -22,6 +23,8 @@ export const combination_sauce: EvalFunction = async ({ usernames: z.array(z.string()).describe("the accepted usernames"), password: z.string().describe("the password for login"), }), + modelName, + useTextExtract, }); await stagehand.act({ diff --git a/evals/experimental/costar.ts b/evals/experimental/costar.ts index 91313700..83d93aae 100644 --- a/evals/experimental/costar.ts +++ b/evals/experimental/costar.ts @@ -2,7 +2,11 @@ import { initStagehand } from "../utils"; import { EvalFunction } from "../../types/evals"; import { z } from "zod"; -export const costar: EvalFunction = async ({ modelName, logger }) => { +export const costar: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -29,7 +33,8 @@ export const costar: EvalFunction = async ({ modelName, logger }) => { schema: z.object({ title: z.string().describe("the title of the article").nullable(), }), - modelName: "gpt-4o-2024-08-06", + modelName, + useTextExtract, }); logger.log({ diff --git a/evals/experimental/extract_aigrant_companies.ts b/evals/experimental/extract_aigrant_companies.ts index 82929a98..d99404c1 100644 --- a/evals/experimental/extract_aigrant_companies.ts +++ b/evals/experimental/extract_aigrant_companies.ts @@ -5,6 +5,7 @@ import { EvalFunction } from "../../types/evals"; export const extract_aigrant_companies: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -26,6 +27,8 @@ export const extract_aigrant_companies: EvalFunction = async ({ }), ), }), + modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/experimental/extract_capacitor_info.ts b/evals/experimental/extract_capacitor_info.ts index ed0d42a0..eabe36c2 100644 --- a/evals/experimental/extract_capacitor_info.ts +++ b/evals/experimental/extract_capacitor_info.ts @@ -6,6 +6,7 @@ import { z } from "zod"; export const extract_capacitor_info: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -27,6 +28,7 @@ export const extract_capacitor_info: EvalFunction = async ({ min_operating_temp: z.string(), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/experimental/extract_partners.ts b/evals/experimental/extract_partners.ts index 24ba146a..7cd6a580 100644 --- a/evals/experimental/extract_partners.ts +++ b/evals/experimental/extract_partners.ts @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals"; import { initStagehand } from "../utils"; import { z } from "zod"; -export const extract_partners: EvalFunction = async ({ modelName, logger }) => { +export const extract_partners: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -40,6 +44,8 @@ export const extract_partners: EvalFunction = async ({ modelName, logger }) => { .optional() .describe("Any explanation about partner listing or absence thereof"), }), + modelName, + useTextExtract, }); const expectedPartners = [ diff --git a/evals/experimental/extract_press_releases.ts b/evals/experimental/extract_press_releases.ts index 5ca26010..713b98f0 100644 --- a/evals/experimental/extract_press_releases.ts +++ b/evals/experimental/extract_press_releases.ts @@ -5,6 +5,7 @@ import { z } from "zod"; export const extract_press_releases: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -15,19 +16,28 @@ export const extract_press_releases: EvalFunction = async ({ const { debugUrl, sessionUrl } = initResponse; try { - await stagehand.page.goto("https://www.landerfornyc.com/news"); + await stagehand.page.goto("https://www.landerfornyc.com/news", { + waitUntil: "networkidle", + }); + await new Promise((resolve) => setTimeout(resolve, 5000)); const result = await stagehand.extract({ instruction: - "extract a list of press releases on this page, with the title and publish date", + "extract the title and corresponding publish date of EACH AND EVERY press releases on this page. DO NOT MISS ANY PRESS RELEASES.", schema: z.object({ items: z.array( z.object({ - title: z.string(), - publishedOn: z.string(), + title: z.string().describe("The title of the press release"), + publish_date: z + .string() + .describe( + "The date the press release was published, eg 'Oct 12, 2021'", + ), }), ), }), + modelName, + useTextExtract, }); await stagehand.close(); @@ -35,11 +45,11 @@ export const extract_press_releases: EvalFunction = async ({ const expectedLength = 28; const expectedFirstItem = { title: "UAW Region 9A Endorses Brad Lander for Mayor", - publishedOn: "Dec 4, 2024", + publish_date: "Dec 4, 2024", }; const expectedLastItem = { title: "An Unassuming Liberal Makes a Rapid Ascent to Power Broker", - publishedOn: "Jan 23, 2014", + publish_date: "Jan 23, 2014", }; if (items.length !== expectedLength) { @@ -68,10 +78,10 @@ export const extract_press_releases: EvalFunction = async ({ const firstItemMatches = items[0].title === expectedFirstItem.title && - items[0].publishedOn === expectedFirstItem.publishedOn; + items[0].publish_date === expectedFirstItem.publish_date; const lastItemMatches = items[items.length - 1].title === expectedLastItem.title && - items[items.length - 1].publishedOn === expectedLastItem.publishedOn; + items[items.length - 1].publish_date === expectedLastItem.publish_date; return { _success: firstItemMatches && lastItemMatches, diff --git a/evals/experimental/extract_snowshoeing_destinations.ts b/evals/experimental/extract_snowshoeing_destinations.ts index fa7bdc8c..67a5b3e7 100644 --- a/evals/experimental/extract_snowshoeing_destinations.ts +++ b/evals/experimental/extract_snowshoeing_destinations.ts @@ -5,6 +5,7 @@ import { EvalFunction } from "../../types/evals"; export const extract_snowshoeing_destinations: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -40,6 +41,7 @@ export const extract_snowshoeing_destinations: EvalFunction = async ({ ), }), modelName, + useTextExtract, }); logger.log({ diff --git a/evals/experimental/google_jobs.ts b/evals/experimental/google_jobs.ts index 3562c019..48886d2a 100644 --- a/evals/experimental/google_jobs.ts +++ b/evals/experimental/google_jobs.ts @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals"; import { initStagehand } from "../utils"; import { z } from "zod"; -export const google_jobs: EvalFunction = async ({ modelName, logger }) => { +export const google_jobs: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -42,7 +46,8 @@ export const google_jobs: EvalFunction = async ({ modelName, logger }) => { .nullable(), }), }), - modelName: "gpt-4o-2024-08-06", + modelName, + useTextExtract, }); const isJobDetailsValid = diff --git a/evals/experimental/homedepot.ts b/evals/experimental/homedepot.ts index dcc2ffff..706298d1 100644 --- a/evals/experimental/homedepot.ts +++ b/evals/experimental/homedepot.ts @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals"; import { initStagehand } from "../utils"; import { z } from "zod"; -export const homedepot: EvalFunction = async ({ modelName, logger }) => { +export const homedepot: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -29,7 +33,8 @@ export const homedepot: EvalFunction = async ({ modelName, logger }) => { ) .describe("Gas grill Primary Burner BTU exact value"), }), - modelName: "gpt-4o-2024-08-06", + modelName, + useTextExtract, }); logger.log({ diff --git a/evals/act/stock_x.ts b/evals/experimental/stock_x.ts similarity index 100% rename from evals/act/stock_x.ts rename to evals/experimental/stock_x.ts diff --git a/evals/experimental/ted_talk.ts b/evals/experimental/ted_talk.ts index eed323c3..7625136e 100644 --- a/evals/experimental/ted_talk.ts +++ b/evals/experimental/ted_talk.ts @@ -3,7 +3,11 @@ import { initStagehand } from "../utils"; import { normalizeString } from "../utils"; import { z } from "zod"; -export const ted_talk: EvalFunction = async ({ modelName, logger }) => { +export const ted_talk: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -36,6 +40,7 @@ export const ted_talk: EvalFunction = async ({ modelName, logger }) => { .describe("List of culture video playlists"), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/extract/extract_baptist_health.ts b/evals/extract/extract_baptist_health.ts index 5b257619..7be6fe41 100644 --- a/evals/extract/extract_baptist_health.ts +++ b/evals/extract/extract_baptist_health.ts @@ -6,6 +6,7 @@ import { compareStrings } from "../utils"; export const extract_baptist_health: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -27,6 +28,7 @@ export const extract_baptist_health: EvalFunction = async ({ fax: z.string(), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/extract/extract_github_stars.ts b/evals/extract/extract_github_stars.ts index 785e0810..93637a7c 100644 --- a/evals/extract/extract_github_stars.ts +++ b/evals/extract/extract_github_stars.ts @@ -5,6 +5,7 @@ import { z } from "zod"; export const extract_github_stars: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -22,6 +23,7 @@ export const extract_github_stars: EvalFunction = async ({ stars: z.number().describe("the number of stars for the project"), }), modelName, + useTextExtract, }); const expectedStarsString = await stagehand.page diff --git a/evals/extract/extract_memorial_healthcare.ts b/evals/extract/extract_memorial_healthcare.ts index 9349c4c2..7cc4a5e1 100644 --- a/evals/extract/extract_memorial_healthcare.ts +++ b/evals/extract/extract_memorial_healthcare.ts @@ -6,6 +6,7 @@ import { compareStrings } from "../utils"; export const extract_memorial_healthcare: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -29,6 +30,8 @@ export const extract_memorial_healthcare: EvalFunction = async ({ }), ), }), + modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/extract/extract_nhl_stats.ts b/evals/extract/extract_nhl_stats.ts index 61ceb8c1..8e1edb62 100644 --- a/evals/extract/extract_nhl_stats.ts +++ b/evals/extract/extract_nhl_stats.ts @@ -6,6 +6,7 @@ import { z } from "zod"; export const extract_nhl_stats: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -31,6 +32,7 @@ export const extract_nhl_stats: EvalFunction = async ({ team: z.string(), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/extract/extract_professional_info.ts b/evals/extract/extract_professional_info.ts index 46c57517..b3f6dbcb 100644 --- a/evals/extract/extract_professional_info.ts +++ b/evals/extract/extract_professional_info.ts @@ -6,6 +6,7 @@ import { z } from "zod"; export const extract_professional_info: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -27,6 +28,7 @@ export const extract_professional_info: EvalFunction = async ({ fax: z.string(), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/extract/extract_regulations.ts b/evals/extract/extract_regulations.ts index 90395023..7a4f1711 100644 --- a/evals/extract/extract_regulations.ts +++ b/evals/extract/extract_regulations.ts @@ -5,6 +5,7 @@ import { z } from "zod"; export const extract_regulations: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -27,6 +28,7 @@ export const extract_regulations: EvalFunction = async ({ ), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/extract/extract_resistor_info.ts b/evals/extract/extract_resistor_info.ts index 97c9db6f..539bac48 100644 --- a/evals/extract/extract_resistor_info.ts +++ b/evals/extract/extract_resistor_info.ts @@ -6,6 +6,7 @@ import { z } from "zod"; export const extract_resistor_info: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -26,6 +27,7 @@ export const extract_resistor_info: EvalFunction = async ({ operating_temperature_range: z.string(), }), modelName, + useTextExtract, }); await stagehand.close(); diff --git a/evals/extract/extract_rockauto.ts b/evals/extract/extract_rockauto.ts index a630b0cc..6de5904d 100644 --- a/evals/extract/extract_rockauto.ts +++ b/evals/extract/extract_rockauto.ts @@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals"; import { initStagehand } from "../utils"; import { z } from "zod"; -export const extract_rockauto: EvalFunction = async ({ modelName, logger }) => { +export const extract_rockauto: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { const { stagehand, initResponse } = await initStagehand({ modelName, logger, @@ -26,6 +30,7 @@ export const extract_rockauto: EvalFunction = async ({ modelName, logger }) => { ), }), modelName, + useTextExtract, domSettleTimeoutMs: 10000, }); diff --git a/evals/extract/extract_staff_members.ts b/evals/extract/extract_staff_members.ts index 8088fc15..36ee0d12 100644 --- a/evals/extract/extract_staff_members.ts +++ b/evals/extract/extract_staff_members.ts @@ -5,6 +5,7 @@ import { EvalFunction } from "../../types/evals"; export const extract_staff_members: EvalFunction = async ({ modelName, logger, + useTextExtract, }) => { const { stagehand, initResponse } = await initStagehand({ modelName, @@ -27,6 +28,8 @@ export const extract_staff_members: EvalFunction = async ({ }), ), }), + modelName, + useTextExtract, }); const staff_members = result.staff_members; diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 433c5cf9..54cedd10 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -15,9 +15,38 @@ import { import { AvailableModel, AvailableModelSchema } from "../types/model"; import { EvalLogger, env } from "./utils"; +const args = process.argv.slice(2); + const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES ? process.env.EVAL_CATEGORIES.split(",") - : ["observe", "act", "combination", "extract", "experimental"]; + : [ + "observe", + "act", + "combination", + "extract", + "experimental", + "text_extract", + ]; + +let extractMethod = "domExtract"; +const extractMethodArg = args.find((arg) => + arg.startsWith("--extract-method="), +); +if (extractMethodArg) { + extractMethod = extractMethodArg.split("=")[1]; + + if (extractMethod === "textExtract") { + throw new Error( + "NotImplementedError: textExtract method is not implemented on this branch.", + ); + } +} + +process.env.EXTRACT_METHOD = extractMethod; +const useTextExtract = process.env.EXTRACT_METHOD === "textExtract"; + +let filterByCategory: string | null = null; +let filterByEvalName: string | null = null; const CATEGORIES: EvalCategory[] = DEFAULT_EVAL_CATEGORIES.map((category) => { if (!EvalCategorySchema.safeParse(category).success) { @@ -27,6 +56,28 @@ const CATEGORIES: EvalCategory[] = DEFAULT_EVAL_CATEGORIES.map((category) => { return category as EvalCategory; }); +if (args.length > 0) { + if (args[0].toLowerCase() === "category") { + filterByCategory = args[1]; + if (!filterByCategory) { + console.error("Error: Category name not specified."); + process.exit(1); + } + try { + EvalCategorySchema.parse(filterByCategory); + } catch { + console.error( + `Error: Invalid category "${filterByCategory}". Valid categories are: ${CATEGORIES.join( + ", ", + )}`, + ); + process.exit(1); + } + } else { + filterByEvalName = args[0]; + } +} + const generateTasksAndCategories = (): { tasks: Record< string, @@ -44,11 +95,11 @@ const generateTasksAndCategories = (): { > = {}; const taskCategories: Record = {}; - CATEGORIES.map((category) => { + CATEGORIES.forEach((category) => { const categoryPath = path.join(__dirname, category); try { const files = fs.readdirSync(categoryPath); - files.map((file) => { + files.forEach((file) => { if (file.endsWith(".ts")) { const taskName = file.replace(".ts", ""); const taskModule = import(`./${category}/${taskName}`) as Promise<{ @@ -69,34 +120,9 @@ const generateTasksAndCategories = (): { const { tasks, taskCategories } = generateTasksAndCategories(); -const args = process.argv.slice(2); -let filterByCategory: string | null = null; -let filterByEvalName: string | null = null; - -if (args.length > 0) { - if (args[0].toLowerCase() === "category") { - filterByCategory = args[1]; - if (!filterByCategory) { - console.error("Error: Category name not specified."); - process.exit(1); - } - try { - EvalCategorySchema.parse(filterByCategory); - } catch { - console.error( - `Error: Invalid category "${filterByCategory}". Valid categories are: ${CATEGORIES.join( - ", ", - )}`, - ); - process.exit(1); - } - } else { - filterByEvalName = args[0]; - if (!Object.keys(tasks).includes(filterByEvalName)) { - console.error(`Error: Evaluation "${filterByEvalName}" does not exist.`); - process.exit(1); - } - } +if (filterByEvalName && !Object.keys(tasks).includes(filterByEvalName)) { + console.error(`Error: Evaluation "${filterByEvalName}" does not exist.`); + process.exit(1); } const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS @@ -320,6 +346,7 @@ const generateFilteredTestcases = (): Testcase[] => { const result = await taskFunction({ modelName: input.modelName, logger, + useTextExtract, }); if (result && result._success) { console.log(`✅ ${input.name}: Passed`); diff --git a/evals/text_extract/extract_aigrant_companies.ts b/evals/text_extract/extract_aigrant_companies.ts new file mode 100644 index 00000000..d99404c1 --- /dev/null +++ b/evals/text_extract/extract_aigrant_companies.ts @@ -0,0 +1,133 @@ +import { z } from "zod"; +import { initStagehand } from "../utils"; +import { EvalFunction } from "../../types/evals"; + +export const extract_aigrant_companies: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + domSettleTimeoutMs: 3000, + }); + + const { debugUrl, sessionUrl } = initResponse; + + await stagehand.page.goto("https://aigrant.com/"); + const companyList = await stagehand.extract({ + instruction: + "Extract all companies that received the AI grant and group them with their batch numbers as an array of objects. Each object should contain the company name and its corresponding batch number.", + schema: z.object({ + companies: z.array( + z.object({ + company: z.string(), + batch: z.string(), + }), + ), + }), + modelName, + useTextExtract, + }); + + await stagehand.close(); + const companies = companyList.companies; + const expectedLength = 91; + + const expectedFirstItem = { + company: "Goodfire", + batch: "4", + }; + + const expectedLastItem = { + company: "Forefront", + batch: "1", + }; + + if (companies.length !== expectedLength) { + logger.error({ + message: "Incorrect number of companies extracted", + level: 0, + auxiliary: { + expected: { + value: expectedLength.toString(), + type: "integer", + }, + actual: { + value: companies.length.toString(), + type: "integer", + }, + }, + }); + return { + _success: false, + error: "Incorrect number of companies extracted", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + const firstItemMatches = + companies[0].company === expectedFirstItem.company && + companies[0].batch === expectedFirstItem.batch; + + if (!firstItemMatches) { + logger.error({ + message: "First company extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedFirstItem), + type: "object", + }, + actual: { + value: JSON.stringify(companies[0]), + type: "object", + }, + }, + }); + return { + _success: false, + error: "First company extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + const lastItemMatches = + companies[companies.length - 1].company === expectedLastItem.company && + companies[companies.length - 1].batch === expectedLastItem.batch; + + if (!lastItemMatches) { + logger.error({ + message: "Last company extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedLastItem), + type: "object", + }, + actual: { + value: JSON.stringify(companies[companies.length - 1]), + type: "object", + }, + }, + }); + return { + _success: false, + error: "Last company extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + return { + _success: true, + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; +}; diff --git a/evals/text_extract/extract_area_codes.ts b/evals/text_extract/extract_area_codes.ts new file mode 100644 index 00000000..8886bf56 --- /dev/null +++ b/evals/text_extract/extract_area_codes.ts @@ -0,0 +1,161 @@ +import { EvalFunction } from "../../types/evals"; +import { initStagehand } from "../utils"; +import { z } from "zod"; + +export const extract_area_codes: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + }); + + const { debugUrl, sessionUrl } = initResponse; + + await stagehand.init(); + await stagehand.page.goto( + "https://www.ncc.gov.ng/technical-regulation/standards/numbering#area-codes-by-zone-primary-centre", + { waitUntil: "domcontentloaded" }, + ); + + const result = await stagehand.extract({ + instruction: + "Extract ALL the Primary Center names and their corresponding Area Code, and the name of their corresponding Zone.", + schema: z.object({ + primary_center_list: z.array( + z.object({ + zone_name: z + .string() + .describe( + "The name of the Zone that the Primary Center is in. For example, 'North Central Zone'.", + ), + primary_center_name: z + .string() + .describe( + "The name of the Primary Center. I.e., this is the name of the city or town.", + ), + area_code: z + .string() + .describe( + "The area code for the Primary Center. This will either be 2 or 3 digits.", + ), + }), + ), + }), + modelName, + useTextExtract, + }); + + await stagehand.close(); + + const primaryCenterList = result.primary_center_list; + const expectedLength = 56; + + const expectedFirstItem = { + zone_name: "Lagos Zone", + primary_center_name: "Lagos", + area_code: "01", + }; + + const expectedLastItem = { + zone_name: "South-East Zone", + primary_center_name: "Yenagoa", + area_code: "089", + }; + + if (primaryCenterList.length !== expectedLength) { + logger.error({ + message: "Incorrect number of primary centers extracted", + level: 0, + auxiliary: { + expected: { + value: expectedLength.toString(), + type: "integer", + }, + actual: { + value: primaryCenterList.length.toString(), + type: "integer", + }, + }, + }); + return { + _success: false, + error: "Incorrect number of primary centers extracted", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + const firstItemMatches = + primaryCenterList[0].zone_name === expectedFirstItem.zone_name && + primaryCenterList[0].primary_center_name === + expectedFirstItem.primary_center_name && + primaryCenterList[0].area_code === expectedFirstItem.area_code; + + if (!firstItemMatches) { + logger.error({ + message: "First primary center extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedFirstItem), + type: "object", + }, + actual: { + value: JSON.stringify(primaryCenterList[0]), + type: "object", + }, + }, + }); + return { + _success: false, + error: "First primary center extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + const lastItemMatches = + primaryCenterList[primaryCenterList.length - 1].zone_name === + expectedLastItem.zone_name && + primaryCenterList[primaryCenterList.length - 1].primary_center_name === + expectedLastItem.primary_center_name && + primaryCenterList[primaryCenterList.length - 1].area_code === + expectedLastItem.area_code; + + if (!lastItemMatches) { + logger.error({ + message: "Last primary center extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedLastItem), + type: "object", + }, + actual: { + value: JSON.stringify( + primaryCenterList[primaryCenterList.length - 1], + ), + type: "object", + }, + }, + }); + return { + _success: false, + error: "Last primary center extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + return { + _success: true, + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; +}; diff --git a/evals/text_extract/extract_capacitor_info.ts b/evals/text_extract/extract_capacitor_info.ts new file mode 100644 index 00000000..eabe36c2 --- /dev/null +++ b/evals/text_extract/extract_capacitor_info.ts @@ -0,0 +1,132 @@ +import { EvalFunction } from "../../types/evals"; +import { initStagehand } from "../utils"; +import { normalizeString } from "../utils"; +import { z } from "zod"; + +export const extract_capacitor_info: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + }); + + const { debugUrl, sessionUrl } = initResponse; + + await stagehand.page.goto( + "https://www.tti.com/content/ttiinc/en/apps/part-detail.html?partsNumber=C320C104K5R5TA&mfgShortname=KEM&productId=6335148", + ); + + const result = await stagehand.extract({ + instruction: + "Extract the TTI Part Number, Product Category, and minimum operating temperature of the capacitor.", + schema: z.object({ + tti_part_number: z.string(), + product_category: z.string(), + min_operating_temp: z.string(), + }), + modelName, + useTextExtract, + }); + + await stagehand.close(); + + const { tti_part_number, product_category, min_operating_temp } = result; + + const expected = { + tti_part_number: "C320C104K5R5TA", + product_category: "Multilayer Ceramic Capacitors MLCC - Leaded", + min_operating_temp: "- 55 C", + }; + + if ( + normalizeString(tti_part_number) !== + normalizeString(expected.tti_part_number) + ) { + logger.error({ + message: "TTI Part Number extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: normalizeString(expected.tti_part_number), + type: "string", + }, + actual: { + value: normalizeString(tti_part_number), + type: "string", + }, + }, + }); + return { + _success: false, + error: "TTI Part Number extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + if ( + normalizeString(product_category) !== + normalizeString(expected.product_category) + ) { + logger.error({ + message: "Product Category extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: normalizeString(expected.product_category), + type: "string", + }, + actual: { + value: normalizeString(product_category), + type: "string", + }, + }, + }); + return { + _success: false, + error: "Product Category extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + if ( + normalizeString(min_operating_temp) !== + normalizeString(expected.min_operating_temp) + ) { + logger.error({ + message: + "Minimum operating temperature extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: normalizeString(expected.min_operating_temp), + type: "string", + }, + actual: { + value: normalizeString(min_operating_temp), + type: "string", + }, + }, + }); + return { + _success: false, + error: "Minimum operating temperature extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + return { + _success: true, + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; +}; diff --git a/evals/text_extract/extract_partners.ts b/evals/text_extract/extract_partners.ts new file mode 100644 index 00000000..7cd6a580 --- /dev/null +++ b/evals/text_extract/extract_partners.ts @@ -0,0 +1,101 @@ +import { EvalFunction } from "../../types/evals"; +import { initStagehand } from "../utils"; +import { z } from "zod"; + +export const extract_partners: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + }); + + const { debugUrl, sessionUrl } = initResponse; + + try { + await stagehand.page.goto("https://ramp.com"); + + await stagehand.act({ + action: "move down to the bottom of the page.", + }); + + await stagehand.act({ + action: "Close the popup.", + }); + + await stagehand.act({ + action: "Find and click on the link that leads to the partners page.", + }); + + const partners = await stagehand.extract({ + instruction: ` + Extract all of the partner categories on the page. + `, + schema: z.object({ + partners: z.array( + z.object({ + partner_category: z.string().describe("The partner category"), + }), + ), + explanation: z + .string() + .optional() + .describe("Any explanation about partner listing or absence thereof"), + }), + modelName, + useTextExtract, + }); + + const expectedPartners = [ + "Accounting Partners", + "Private Equity & Venture Capital Partners", + "Services Partners", + "Affiliates", + ]; + + const foundPartners = partners.partners.map((partner) => + partner.partner_category.toLowerCase(), + ); + + const allExpectedPartnersFound = expectedPartners.every((partner) => + foundPartners.includes(partner.toLowerCase()), + ); + + await stagehand.close(); + + return { + _success: allExpectedPartnersFound, + partners, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + logger.error({ + message: "error in extractPartners function", + level: 0, + auxiliary: { + error: { + value: error.message, + type: "string", + }, + trace: { + value: error.stack, + type: "string", + }, + }, + }); + + await stagehand.close(); + + return { + _success: false, + debugUrl, + sessionUrl, + error: JSON.parse(JSON.stringify(error, null, 2)), + logs: logger.getLogs(), + }; + } +}; diff --git a/evals/text_extract/extract_press_releases.ts b/evals/text_extract/extract_press_releases.ts new file mode 100644 index 00000000..fdce6bc3 --- /dev/null +++ b/evals/text_extract/extract_press_releases.ts @@ -0,0 +1,118 @@ +import { EvalFunction } from "../../types/evals"; +import { initStagehand } from "../utils"; +import { z } from "zod"; + +export const extract_press_releases: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + domSettleTimeoutMs: 3000, + }); + + const { debugUrl, sessionUrl } = initResponse; + + try { + await stagehand.page.goto("https://www.landerfornyc.com/news", { + waitUntil: "networkidle", + }); + // timeout for 5 seconds to allow for the page to load + await new Promise((resolve) => setTimeout(resolve, 5000)); + + const result = await stagehand.extract({ + instruction: + "extract the title and corresponding publish date of EACH AND EVERY press releases on this page. DO NOT MISS ANY PRESS RELEASES.", + schema: z.object({ + items: z.array( + z.object({ + title: z.string().describe("The title of the press release"), + publish_date: z + .string() + .describe( + "The date the press release was published, eg 'Oct 12, 2021'", + ), + }), + ), + }), + modelName, + useTextExtract, + }); + + await stagehand.close(); + const items = result.items; + const expectedLength = 28; + const expectedFirstItem = { + title: "UAW Region 9A Endorses Brad Lander for Mayor", + publish_date: "Dec 4, 2024", + }; + const expectedLastItem = { + title: "An Unassuming Liberal Makes a Rapid Ascent to Power Broker", + publish_date: "Jan 23, 2014", + }; + + if (items.length !== expectedLength) { + logger.error({ + message: "Incorrect number of items extracted", + level: 0, + auxiliary: { + expected: { + value: expectedLength.toString(), + type: "integer", + }, + actual: { + value: items.length.toString(), + type: "integer", + }, + }, + }); + return { + _success: false, + error: "Incorrect number of items extracted", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + const firstItemMatches = + items[0].title === expectedFirstItem.title && + items[0].publish_date === expectedFirstItem.publish_date; + const lastItemMatches = + items[items.length - 1].title === expectedLastItem.title && + items[items.length - 1].publish_date === expectedLastItem.publish_date; + + return { + _success: firstItemMatches && lastItemMatches, + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } catch (error) { + logger.error({ + message: `Error in extract_press_releases function`, + level: 0, + auxiliary: { + error: { + value: error.message || JSON.stringify(error), + type: "string", + }, + trace: { + value: error.stack, + type: "string", + }, + }, + }); + return { + _success: false, + error: "An error occurred during extraction", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } finally { + await stagehand.context.close(); + } +}; diff --git a/evals/text_extract/extract_public_notices.ts b/evals/text_extract/extract_public_notices.ts new file mode 100644 index 00000000..70fc1eb8 --- /dev/null +++ b/evals/text_extract/extract_public_notices.ts @@ -0,0 +1,180 @@ +import { EvalFunction } from "../../types/evals"; +import { initStagehand } from "../utils"; +import { z } from "zod"; +import { compareStrings } from "../utils"; + +export const extract_public_notices: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + }); + + const { debugUrl, sessionUrl } = initResponse; + + await stagehand.init(); + await stagehand.page.goto( + "https://www.sars.gov.za/legal-counsel/secondary-legislation/public-notices/", + { waitUntil: "load" }, + ); + + const result = await stagehand.extract({ + instruction: + "Extract ALL the public notice descriptions with their corresponding, GG number and publication date. Extract ALL notices from 2024 through 2020. Do not include the Notice number.", + schema: z.object({ + public_notices: z.array( + z.object({ + notice_description: z + .string() + .describe( + "the description of the notice. Do not include the Notice number", + ), + gg_number: z + .string() + .describe("the GG number of the notice. For example, GG 12345"), + publication_date: z + .string() + .describe( + "the publication date of the notice. For example, 8 December 2021", + ), + }), + ), + }), + modelName, + useTextExtract, + }); + + await stagehand.close(); + + const publicNotices = result.public_notices; + const expectedLength = 24; + + const expectedFirstItem = { + notice_description: + "Additional considerations in terms of section 80(2) in respect of which an application for a binding private ruling or a binding class ruling may be rejected", + gg_number: "GG 51526", + publication_date: "8 November 2024", + }; + + const expectedLastItem = { + notice_description: + "Notice in terms of section 25, read with section 66(1) of the Income Tax Act, 1962, for submission of 2020 income tax returns", + gg_number: "GG 43495", + publication_date: "3 July 2020", + }; + + if (publicNotices.length !== expectedLength) { + logger.error({ + message: "Incorrect number of public notices extracted", + level: 0, + auxiliary: { + expected: { + value: expectedLength.toString(), + type: "integer", + }, + actual: { + value: publicNotices.length.toString(), + type: "integer", + }, + }, + }); + return { + _success: false, + error: "Incorrect number of public notices extracted", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + const firstItemMatches = + compareStrings( + publicNotices[0].notice_description, + expectedFirstItem.notice_description, + 0.9, + ) && + compareStrings( + publicNotices[0].gg_number, + expectedFirstItem.gg_number, + 0.9, + ) && + compareStrings( + publicNotices[0].publication_date, + expectedFirstItem.publication_date, + 0.9, + ); + + if (!firstItemMatches) { + logger.error({ + message: "First public notice extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedFirstItem), + type: "object", + }, + actual: { + value: JSON.stringify(publicNotices[0]), + type: "object", + }, + }, + }); + return { + _success: false, + error: "First public notice extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + const lastItemMatches = + compareStrings( + publicNotices[publicNotices.length - 1].notice_description, + expectedLastItem.notice_description, + 0.9, + ) && + compareStrings( + publicNotices[publicNotices.length - 1].gg_number, + expectedLastItem.gg_number, + 0.9, + ) && + compareStrings( + publicNotices[publicNotices.length - 1].publication_date, + expectedLastItem.publication_date, + 0.9, + ); + + if (!lastItemMatches) { + logger.error({ + message: "Last public notice extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedLastItem), + type: "object", + }, + actual: { + value: JSON.stringify(publicNotices[publicNotices.length - 1]), + type: "object", + }, + }, + }); + return { + _success: false, + error: "Last public notice extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + return { + _success: true, + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; +}; diff --git a/evals/text_extract/extract_research_reports.ts b/evals/text_extract/extract_research_reports.ts new file mode 100644 index 00000000..6b1f36f2 --- /dev/null +++ b/evals/text_extract/extract_research_reports.ts @@ -0,0 +1,138 @@ +import { EvalFunction } from "../../types/evals"; +import { initStagehand } from "../utils"; +import { z } from "zod"; + +export const extract_research_reports: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + }); + + const { debugUrl, sessionUrl } = initResponse; + + await stagehand.init(); + await stagehand.page.goto( + "http://www.dsbd.gov.za/index.php/research-reports", + { waitUntil: "load" }, + ); + + const result = await stagehand.extract({ + instruction: + "Extract ALL the research report names. Do not extract the names of the PDF attachments.", + schema: z.object({ + reports: z.array( + z.object({ + report_name: z + .string() + .describe( + "The name or title of the research report. NOT the name of the PDF attachment.", + ), + }), + ), + }), + modelName, + useTextExtract, + }); + + await stagehand.close(); + + const reports = result.reports; + const expectedLength = 9; + + const expectedFirstItem = { + report_name: + "Longitudinal Study on SMMEs and Co-operatives in South Africa and the study on the Eastern SeaBoard", + }; + + const expectedLastItem = { + report_name: "Research Agenda", + }; + + if (reports.length !== expectedLength) { + logger.error({ + message: "Incorrect number of reports extracted", + level: 0, + auxiliary: { + expected: { + value: expectedLength.toString(), + type: "integer", + }, + actual: { + value: reports.length.toString(), + type: "integer", + }, + }, + }); + return { + _success: false, + error: "Incorrect number of reports extracted", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + const firstItemMatches = + reports[0].report_name === expectedFirstItem.report_name; + + if (!firstItemMatches) { + logger.error({ + message: "First report extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedFirstItem), + type: "object", + }, + actual: { + value: JSON.stringify(reports[0]), + type: "object", + }, + }, + }); + return { + _success: false, + error: "First report extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + const lastItemMatches = + reports[reports.length - 1].report_name === expectedLastItem.report_name; + + if (!lastItemMatches) { + logger.error({ + message: "Last report extracted does not match expected", + level: 0, + auxiliary: { + expected: { + value: JSON.stringify(expectedLastItem), + type: "object", + }, + actual: { + value: JSON.stringify(reports[reports.length - 1]), + type: "object", + }, + }, + }); + return { + _success: false, + error: "Last report extracted does not match expected", + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; + } + + return { + _success: true, + logs: logger.getLogs(), + debugUrl, + sessionUrl, + }; +}; diff --git a/evals/text_extract/extract_snowshoeing_destinations.ts b/evals/text_extract/extract_snowshoeing_destinations.ts new file mode 100644 index 00000000..67a5b3e7 --- /dev/null +++ b/evals/text_extract/extract_snowshoeing_destinations.ts @@ -0,0 +1,94 @@ +import { z } from "zod"; +import { initStagehand } from "../utils"; +import { EvalFunction } from "../../types/evals"; + +export const extract_snowshoeing_destinations: EvalFunction = async ({ + modelName, + logger, + useTextExtract, +}) => { + const { stagehand, initResponse } = await initStagehand({ + modelName, + logger, + }); + + const { debugUrl, sessionUrl } = initResponse; + + try { + await stagehand.page.goto( + "https://www.cbisland.com/blog/10-snowshoeing-adventures-on-cape-breton-island/", + ); + + await stagehand.act({ action: "reject the cookies" }); + + const snowshoeing_regions = await stagehand.extract({ + instruction: + "Extract all the snowshoeing regions and the names of the trails within each region.", + schema: z.object({ + snowshoeing_regions: z.array( + z.object({ + region_name: z + .string() + .describe("The name of the snowshoeing region"), + trails: z + .array( + z.object({ + trail_name: z.string().describe("The name of the trail"), + }), + ) + .describe("The list of trails available in this region."), + }), + ), + }), + modelName, + useTextExtract, + }); + + logger.log({ + message: "Extracted destinations and trails", + level: 1, + auxiliary: { + destinations: { + value: JSON.stringify(snowshoeing_regions), + type: "object", + }, + }, + }); + + await stagehand.close(); + + const _success = snowshoeing_regions.snowshoeing_regions.length === 10; + + return { + _success, + snowshoeing_regions, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + logger.error({ + message: "Error in extract_snowshoeing_destinations function", + level: 0, + auxiliary: { + error: { + value: error.message, + type: "string", + }, + trace: { + value: error.stack, + type: "string", + }, + }, + }); + return { + _success: false, + error: JSON.parse(JSON.stringify(error, null, 2)), + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.context.close().catch(() => {}); + } +}; diff --git a/lib/prompt.ts b/lib/prompt.ts index 3dc3a4fd..37217993 100644 --- a/lib/prompt.ts +++ b/lib/prompt.ts @@ -201,23 +201,47 @@ export const actTools: Array = [ ]; // extract -const extractSystemPrompt = `You are extracting content on behalf of a user. You will be given: +export function buildExtractSystemPrompt( + isUsingPrintExtractedDataTool: boolean = false, + useTextExtract: boolean = true, +): ChatMessage { + const baseContent = `You are extracting content on behalf of a user. + If a user asks you to extract a 'list' of information, or 'all' information, + YOU MUST EXTRACT ALL OF THE INFORMATION THAT THE USER REQUESTS. + + You will be given: 1. An instruction -2. A list of DOM elements to extract from +2. `; + + const contentDetail = useTextExtract + ? `A text representation of a webpage to extract information from.` + : `A list of DOM elements to extract from.`; -Print the exact text from the DOM elements with all symbols, characters, and endlines as is. + const instructions = ` +Print the exact text from the ${ + useTextExtract ? "text-rendered webpage" : "DOM elements" + } with all symbols, characters, and endlines as is. Print null or an empty string if no new information is found. -`; + `.trim(); -export function buildExtractSystemPrompt( - isUsingPrintExtractedDataTool: boolean = false, -): ChatMessage { - let content = extractSystemPrompt.replace(/\s+/g, " "); - if (isUsingPrintExtractedDataTool) { - content += ` + const toolInstructions = isUsingPrintExtractedDataTool + ? ` ONLY print the content using the print_extracted_data tool provided. -ONLY print the content using the print_extracted_data tool provided.`; - } +ONLY print the content using the print_extracted_data tool provided. + `.trim() + : ""; + + const additionalInstructions = useTextExtract + ? `Once you are given the text-rendered webpage, + you must thoroughly and meticulously analyze it. Be very careful to ensure that you + do not miss any important information.` + : ""; + + const content = + `${baseContent}${contentDetail}\n\n${instructions}\n${toolInstructions}${ + additionalInstructions ? `\n\n${additionalInstructions}` : "" + }`.replace(/\s+/g, " "); + return { role: "system", content, diff --git a/types/evals.ts b/types/evals.ts index 01cca7f0..3b4589da 100644 --- a/types/evals.ts +++ b/types/evals.ts @@ -7,6 +7,7 @@ import { EvalCase } from "braintrust"; export type EvalFunction = (args: { modelName: AvailableModel; logger: EvalLogger; + useTextExtract: boolean; }) => Promise<{ _success: boolean; logs: LogLine[]; @@ -21,6 +22,7 @@ export const EvalCategorySchema = z.enum([ "combination", "extract", "experimental", + "text_extract", ]); export type EvalCategory = z.infer; diff --git a/types/stagehand.ts b/types/stagehand.ts index 58396573..284df989 100644 --- a/types/stagehand.ts +++ b/types/stagehand.ts @@ -70,6 +70,7 @@ export interface ExtractOptions { modelName?: AvailableModel; modelClientOptions?: ClientOptions; domSettleTimeoutMs?: number; + useTextExtract?: boolean; } export type ExtractResult = z.infer;