Evals for TextExtract (#303)

* move stockx to experimental * accept param for extraction method * add textExtract-specific evals * accept textExtract param * fix instruction and accept textExtract param * update extract system prompt; add additional prompt if using textExtract * parse args before setting extract method * fix errors from merge * prettier * don't use any * raise NotImplementedError for textExtract * rename textextract to text_extract * add text_extract evals to CI * prettier * add useTextExtract to ExtractOptions * run both extract methods in CI * prettier * continue on error until textExtract is implemented * fix yml * remove extra space * fix indentation * prettier * increase time limit
browserbase · Dec 15, 2024 · d2b591d · d2b591d
1 parent cdaf3ea
commit d2b591d
Show file tree

Hide file tree

Showing 40 changed files with 1,361 additions and 86 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -14,7 +14,7 @@ on:
 env:
   EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
   EXPERIMENTAL_EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest,o1-mini,o1-preview"
-  EVAL_CATEGORIES: "observe,act,combination,extract,experimental"
+  EVAL_CATEGORIES: "observe,act,combination,extract,experimental,text_extract"
 
 concurrency:
   group: ${{ github.ref }}
@@ -67,7 +67,6 @@ jobs:
       BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
       HEADLESS: true
       EVAL_ENV: browserbase
-
     steps:
       - name: Check out repository code
         uses: actions/checkout@v4
@@ -83,29 +82,98 @@ jobs:
       - name: Install Playwright browsers
         run: npm exec playwright install --with-deps
 
-      - name: Run Extract Evals
-        run: npm run evals category extract
+      # Run extract category with domExtract
+      - name: Run Extract Evals (domExtract)
+        run: npm run evals category extract -- --extract-method=domExtract
+      - name: Save Extract Dom Results
+        run: mv eval-summary.json eval-summary-extract-dom.json
+
+      # Run extract category with textExtract
+      - name: Run Extract Evals (textExtract)
+        run: npm run evals category extract -- --extract-method=textExtract
+        continue-on-error: true
+      #      - name: Save Extract Text Results
+      #        run: mv eval-summary.json eval-summary-extract-text.json
 
-      - name: Log Extract Evals Performance
+      - name: Log and Compare Extract Evals Performance
         run: |
-          experimentName=$(jq -r '.experimentName' eval-summary.json)
-          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
-          if [ -f eval-summary.json ]; then
-            extract_score=$(jq '.categories.extract' eval-summary.json)
-            echo "Extract category score: $extract_score%"
-            if (( $(echo "$extract_score < 80" | bc -l) )); then
-              echo "Extract category score is below 80%. Failing CI."
-              exit 1
-            fi
-          else
-            echo "Eval summary not found for extract category. Failing CI."
+          experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
+          dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
+          echo "DomExtract Extract category score: $dom_score%"
+          echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
+
+          # experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json)
+          # text_score=$(jq '.categories.extract' eval-summary-extract-text.json)
+          # echo "TextExtract Extract category score: $text_score%"
+          # echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"
+
+          if (( $(echo "$dom_score < 80" | bc -l) )); then
+            echo "DomExtract extract category score is below 80%. Failing CI."
             exit 1
           fi
 
+  run-text-extract-evals:
+    needs: [run-extract-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 40
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        run: npm install --no-frozen-lockfile
+
+      - name: Install Playwright browsers
+        run: npm exec playwright install --with-deps
+
+      # Run text_extract category with domExtract
+      - name: Run text_extract Evals (domExtract)
+        run: npm run evals category text_extract -- --extract-method=domExtract
+      - name: Save text_extract Dom Results
+        run: mv eval-summary.json eval-summary-text_extract-dom.json
+
+      # Run text_extract category with textExtract
+      - name: Run text_extract Evals (textExtract)
+        run: npm run evals category text_extract -- --extract-method=textExtract
+        continue-on-error: true
+      #      - name: Save text_extract Text Results
+      #        run: mv eval-summary.json eval-summary-text_extract-text.json
+
+      - name: Log and Compare text_extract Evals Performance
+        run: |
+          experimentNameDom=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
+          dom_score=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
+          echo "DomExtract text_extract category score: $dom_score%"
+          echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
+
+  #          experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
+  #          text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
+  #          echo "TextExtract text_extract category score: $text_score%"
+  #          echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"
+
+  # Disabled until textExtract is implemented: fail CI only if textExtract is below 80%
+  #          if (( $(echo "$text_score < 80" | bc -l) )); then
+  #            echo "textExtract text_extract category score is below 80%. Failing CI."
+  #            exit 1
+  #          fi
+
   run-act-evals:
     runs-on: ubuntu-latest
     timeout-minutes: 25
-    needs: [run-extract-evals]
+    needs: [run-text-extract-evals]
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -198,7 +266,7 @@ jobs:
 
   run-combination-evals:
     runs-on: ubuntu-latest
-    timeout-minutes: 25
+    timeout-minutes: 40
     needs: [run-observe-evals]
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -246,7 +314,7 @@ jobs:
   run-experimental-evals:
     runs-on: ubuntu-latest
     timeout-minutes: 120
-    needs: [run-combination-evals]
+    needs: [run-text-extract-evals]
     if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

diff --git a/evals/combination/allrecipes.ts b/evals/combination/allrecipes.ts
@@ -2,7 +2,11 @@ import { initStagehand } from "../utils";
 import { EvalFunction } from "../../types/evals";
 import { z } from "zod";
 
-export const allrecipes: EvalFunction = async ({ modelName, logger }) => {
+export const allrecipes: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
     logger,
@@ -28,6 +32,7 @@ export const allrecipes: EvalFunction = async ({ modelName, logger }) => {
         .describe("Total number of ratings for the recipe"),
     }),
     modelName,
+    useTextExtract,
   });
 
   await stagehand.close();

diff --git a/evals/combination/arxiv.ts b/evals/combination/arxiv.ts
@@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals";
 import { initStagehand } from "../utils";
 import { z } from "zod";
 
-export const arxiv: EvalFunction = async ({ modelName, logger }) => {
+export const arxiv: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
     logger,
@@ -29,7 +33,8 @@ export const arxiv: EvalFunction = async ({ modelName, logger }) => {
           )
           .describe("list of papers"),
       }),
-      modelName: "gpt-4o-2024-08-06",
+      modelName,
+      useTextExtract,
     });
 
     if (
@@ -86,7 +91,8 @@ export const arxiv: EvalFunction = async ({ modelName, logger }) => {
               )
               .nullable(),
           }),
-          modelName: "gpt-4o-2024-08-06",
+          modelName,
+          useTextExtract,
         });
 
         papers.push({

diff --git a/evals/combination/extract_collaborators.ts b/evals/combination/extract_collaborators.ts
@@ -5,6 +5,7 @@ import { z } from "zod";
 export const extract_collaborators: EvalFunction = async ({
   modelName,
   logger,
+  useTextExtract,
 }) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
@@ -32,6 +33,7 @@ export const extract_collaborators: EvalFunction = async ({
         ),
       }),
       modelName,
+      useTextExtract,
     });
 
     await stagehand.close();

diff --git a/evals/combination/extract_github_commits.ts b/evals/combination/extract_github_commits.ts
@@ -5,6 +5,7 @@ import { z } from "zod";
 export const extract_github_commits: EvalFunction = async ({
   modelName,
   logger,
+  useTextExtract,
 }) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
@@ -32,6 +33,7 @@ export const extract_github_commits: EvalFunction = async ({
         ),
       }),
       modelName,
+      useTextExtract,
     });
 
     logger.log({

diff --git a/evals/combination/imdb_movie_details.ts b/evals/combination/imdb_movie_details.ts
@@ -5,6 +5,7 @@ import { z } from "zod";
 export const imdb_movie_details: EvalFunction = async ({
   modelName,
   logger,
+  useTextExtract,
 }) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
@@ -28,6 +29,7 @@ export const imdb_movie_details: EvalFunction = async ({
         .describe("List of countries with the most ratings"),
     }),
     modelName,
+    useTextExtract,
   });
 
   await stagehand.close();

diff --git a/evals/combination/peeler_complex.ts b/evals/combination/peeler_complex.ts
@@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals";
 import { initStagehand } from "../utils";
 import { z } from "zod";
 
-export const peeler_complex: EvalFunction = async ({ modelName, logger }) => {
+export const peeler_complex: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
     logger,
@@ -27,7 +31,8 @@ export const peeler_complex: EvalFunction = async ({ modelName, logger }) => {
     const { price } = await stagehand.extract({
       instruction: "get the price of the peeler",
       schema: z.object({ price: z.number().nullable() }),
-      modelName: "gpt-4o-2024-08-06",
+      modelName,
+      useTextExtract,
     });
 
     await stagehand.close();

diff --git a/evals/combination/sciquest.ts b/evals/combination/sciquest.ts
@@ -2,7 +2,11 @@ import { initStagehand } from "../utils";
 import { EvalFunction } from "../../types/evals";
 import { z } from "zod";
 
-export const sciquest: EvalFunction = async ({ modelName, logger }) => {
+export const sciquest: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
     logger,
@@ -25,6 +29,7 @@ export const sciquest: EvalFunction = async ({ modelName, logger }) => {
       total_results: z.string(),
     }),
     modelName,
+    useTextExtract,
   });
 
   await stagehand.close();

diff --git a/evals/experimental/combination_sauce.ts b/evals/experimental/combination_sauce.ts
@@ -5,6 +5,7 @@ import { z } from "zod";
 export const combination_sauce: EvalFunction = async ({
   modelName,
   logger,
+  useTextExtract,
 }) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
@@ -22,6 +23,8 @@ export const combination_sauce: EvalFunction = async ({
         usernames: z.array(z.string()).describe("the accepted usernames"),
         password: z.string().describe("the password for login"),
       }),
+      modelName,
+      useTextExtract,
     });
 
     await stagehand.act({

diff --git a/evals/experimental/costar.ts b/evals/experimental/costar.ts
@@ -2,7 +2,11 @@ import { initStagehand } from "../utils";
 import { EvalFunction } from "../../types/evals";
 import { z } from "zod";
 
-export const costar: EvalFunction = async ({ modelName, logger }) => {
+export const costar: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
     logger,
@@ -29,7 +33,8 @@ export const costar: EvalFunction = async ({ modelName, logger }) => {
       schema: z.object({
         title: z.string().describe("the title of the article").nullable(),
       }),
-      modelName: "gpt-4o-2024-08-06",
+      modelName,
+      useTextExtract,
     });
 
     logger.log({

diff --git a/evals/experimental/extract_aigrant_companies.ts b/evals/experimental/extract_aigrant_companies.ts
@@ -5,6 +5,7 @@ import { EvalFunction } from "../../types/evals";
 export const extract_aigrant_companies: EvalFunction = async ({
   modelName,
   logger,
+  useTextExtract,
 }) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
@@ -26,6 +27,8 @@ export const extract_aigrant_companies: EvalFunction = async ({
         }),
       ),
     }),
+    modelName,
+    useTextExtract,
   });
 
   await stagehand.close();

diff --git a/evals/experimental/extract_capacitor_info.ts b/evals/experimental/extract_capacitor_info.ts
@@ -6,6 +6,7 @@ import { z } from "zod";
 export const extract_capacitor_info: EvalFunction = async ({
   modelName,
   logger,
+  useTextExtract,
 }) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
@@ -27,6 +28,7 @@ export const extract_capacitor_info: EvalFunction = async ({
       min_operating_temp: z.string(),
     }),
     modelName,
+    useTextExtract,
   });
 
   await stagehand.close();

diff --git a/evals/experimental/extract_partners.ts b/evals/experimental/extract_partners.ts
@@ -2,7 +2,11 @@ import { EvalFunction } from "../../types/evals";
 import { initStagehand } from "../utils";
 import { z } from "zod";
 
-export const extract_partners: EvalFunction = async ({ modelName, logger }) => {
+export const extract_partners: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
   const { stagehand, initResponse } = await initStagehand({
     modelName,
     logger,
@@ -40,6 +44,8 @@ export const extract_partners: EvalFunction = async ({ modelName, logger }) => {
           .optional()
           .describe("Any explanation about partner listing or absence thereof"),
       }),
+      modelName,
+      useTextExtract,
     });
 
     const expectedPartners = [