``` ├── .changeset/ ├── breezy-carpets-press.md ├── config.json ├── cyan-symbols-double.md ├── empty-bugs-occur.md ├── fifty-cats-sell.md ├── floppy-experts-wash.md ├── green-signs-live.md ├── loose-cooks-lead.md ├── short-banks-sit.md ├── solid-rice-admire.md ├── sweet-glasses-hope.md ├── thick-buckets-act.md ├── tricky-nights-remain.md ├── vast-vans-crash.md ├── whole-yaks-cheat.md ├── .cursorrules ├── .env.example ├── .github/ ├── pull_request_template ├── workflows/ ├── ci.yml ├── release.yml ├── .gitignore ├── .prettierignore ├── .prettierrc ├── .vscode/ ├── settings.json ├── CHANGELOG.md ├── LICENSE ├── README.md ├── docs/ ├── logging.md ├── media/ ├── chunks.png ├── stagehand-playwright.png ├── release.md ├── eslint.config.mjs ├── evals/ ├── args.ts ├── assets/ ├── cart.html ├── peeler.html ├── deterministic/ ├── auxiliary/ ├── logo.png ├── bb.playwright.config.ts ├── e2e.playwright.config.ts ├── local.playwright.config.ts ├── stagehand.config.ts ├── tests/ ├── BrowserContext/ ├── addInitScript.test.ts ├── cookies.test.ts ├── multiPage.test.ts ├── page.test.ts ├── routing.test.ts ├── Errors/ ├── apiKeyError.test.ts ├── browserbase/ ├── contexts.test.ts ├── downloads.test.ts ├── sessions.test.ts ├── uploads.test.ts ├── local/ ├── create.test.ts ├── page/ ├── addInitScript.test.ts ├── addRemoveLocatorHandler.test.ts ├── addTags.test.ts ├── bringToFront.test.ts ├── content.test.ts ├── evaluate.test.ts ├── expose.test.ts ├── frames.test.ts ├── getBy.test.ts ├── navigation.test.ts ├── on.test.ts ├── pageContext.test.ts ├── reload.test.ts ├── waitFor.test.ts ├── env.ts ├── evals.config.json ├── evaluator.ts ├── index.eval.ts ├── initStagehand.ts ├── llm_clients/ ├── hn_aisdk.ts ├── hn_customOpenAI.ts ├── hn_langchain.ts ├── logger.ts ├── scoring.ts ├── taskConfig.ts ├── tasks/ ├── agent/ ├── google_flights.ts ├── iframe_form.ts ├── iframe_form_multiple.ts ├── sf_library_card.ts ├── sf_library_card_multiple.ts ├── allrecipes.ts ├── amazon_add_to_cart.ts ├── 
apple.ts ├── arxiv.ts ├── bidnet.ts ├── checkboxes.ts ├── combination_sauce.ts ``` ## /.changeset/breezy-carpets-press.md --- "@browserbasehq/stagehand": patch --- remove unused fillInVariables fn ## /.changeset/config.json ```json path="/.changeset/config.json" { "$schema": "https://unpkg.com/@changesets/config@2.1.1/schema.json", "commit": false, "fixed": [], "linked": [], "baseBranch": "main", "updateInternalDependencies": "patch", "access": "public", "changelog": [ "@changesets/changelog-github", { "repo": "browserbase/stagehand" } ], "snapshot": { "useCalculatedVersion": true, "prereleaseTemplate": "alpha-{commit}", "tag": "alpha" } } ``` ## /.changeset/cyan-symbols-double.md --- "@browserbasehq/stagehand": patch --- Updated the list of OpenAI models (4.1, o3...) ## /.changeset/empty-bugs-occur.md --- "@browserbasehq/stagehand": patch --- wrap page.evaluate to make sure we have injected browser side scripts before calling them ## /.changeset/fifty-cats-sell.md --- "@browserbasehq/stagehand": minor --- extract links ## /.changeset/floppy-experts-wash.md --- "@browserbasehq/stagehand": patch --- remove unnecessary log ## /.changeset/green-signs-live.md --- "@browserbasehq/stagehand": patch --- use javascript click instead of playwright ## /.changeset/loose-cooks-lead.md --- "@browserbasehq/stagehand": patch --- Fixed removing a hanging observation map that is no longer used ## /.changeset/short-banks-sit.md --- "@browserbasehq/stagehand": patch --- Fixed the schema input for Gemini's response model ## /.changeset/solid-rice-admire.md --- "@browserbasehq/stagehand": minor --- Added Gemini 2.5 Flash to Google supported models ## /.changeset/sweet-glasses-hope.md --- "@browserbasehq/stagehand": patch --- allow form filling when form is not top-most element ## /.changeset/thick-buckets-act.md --- "@browserbasehq/stagehand": patch --- add telemetry for cua agents to stagehand.metrics ## /.changeset/tricky-nights-remain.md --- "@browserbasehq/stagehand": patch --- rm 
deprecated primitives from stagehand object ## /.changeset/vast-vans-crash.md --- "@browserbasehq/stagehand": patch --- Fixes a redundant log ## /.changeset/whole-yaks-cheat.md --- "@browserbasehq/stagehand": minor --- Added a new class - Stagehand Evaluator - that wraps around a Stagehand object to determine whether a task is successful or not. Currently used for agent evals ## /.cursorrules ```cursorrules path="/.cursorrules" # Stagehand Project This is a project that uses Stagehand, which amplifies Playwright with `act`, `extract`, and `observe` added to the Page class. `Stagehand` is a class that provides config, a `StagehandPage` object via `stagehand.page`, and a `StagehandContext` object via `stagehand.context`. `Page` is a class that extends the Playwright `Page` class and adds `act`, `extract`, and `observe` methods. `Context` is a class that extends the Playwright `BrowserContext` class. Use the following rules to write code for this project. - To take an action on the page like "click the sign in button", use Stagehand `act` like this: \`\`\`typescript await page.act("Click the sign in button"); \`\`\` - To plan an instruction before taking an action, use Stagehand `observe` to get the action to execute. \`\`\`typescript const [action] = await page.observe("Click the sign in button"); \`\`\` - The result of `observe` is an array of `ObserveResult` objects that can directly be used as params for `act` like this: \`\`\`typescript const [action] = await page.observe("Click the sign in button"); await page.act(action); \`\`\` - When writing code that needs to extract data from the page, use Stagehand `extract`. 
Explicitly pass the following params by default: \`\`\`typescript const { someValue } = await page.extract({ instruction: the instruction to execute, schema: z.object({ someValue: z.string(), }), // The schema to extract }); \`\`\` ## Initialize \`\`\`typescript import { Stagehand } from "@browserbasehq/stagehand"; import StagehandConfig from "./stagehand.config"; const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; // Playwright Page with act, extract, and observe methods const context = stagehand.context; // Playwright BrowserContext \`\`\` ## Act You can cache the results of `observe` and use them as params for `act` like this: \`\`\`typescript const instruction = "Click the sign in button"; const cachedAction = await getCache(instruction); if (cachedAction) { await page.act(cachedAction); } else { try { const results = await page.observe(instruction); await setCache(instruction, results); await page.act(results[0]); } catch (error) { await page.act(instruction); // If the action is not cached, execute the instruction directly } } \`\`\` Be sure to cache the results of `observe` and use them as params for `act` to avoid unexpected DOM changes. Using `act` without caching will result in more unpredictable behavior. Act `action` should be as atomic and specific as possible, i.e. "Click the sign in button" or "Type 'hello' into the search input". AVOID actions that are more than one step, i.e. "Order me pizza" or "Type in the search bar and hit enter". ## Extract If you are writing code that needs to extract data from the page, use Stagehand `extract`. 
\`\`\`typescript const signInButtonText = await page.extract("extract the sign in button text"); \`\`\` You can also pass in params like an output schema in Zod, and a flag to use text extraction: \`\`\`typescript const data = await page.extract({ instruction: "extract the sign in button text", schema: z.object({ text: z.string(), }), }); \`\`\` `schema` is a Zod schema that describes the data you want to extract. To extract an array, make sure to pass in a single object that contains the array, as follows: \`\`\`typescript const data = await page.extract({ instruction: "extract the text inside all buttons", schema: z.object({ text: z.array(z.string()), }), useTextExtract: true, // Set true for larger-scale extractions (multiple paragraphs), or set false for small extractions (name, birthday, etc) }); \`\`\` ## Agent Use the `agent` method to autonomously execute larger tasks like "Get the stock price of NVDA" \`\`\`typescript // Navigate to a website await stagehand.page.goto("https://www.google.com"); const agent = stagehand.agent({ // You can use either OpenAI or Anthropic provider: "openai", // The model to use (claude-3-7-sonnet-20250219 or claude-3-5-sonnet-20240620 for Anthropic) model: "computer-use-preview", // Customize the system prompt instructions: `You are a helpful assistant that can use a web browser. 
Do not ask follow up questions, the user will trust your judgement.`, // Customize the API key options: { apiKey: process.env.OPENAI_API_KEY, }, }); // Execute the agent await agent.execute( "Apply for a library card at the San Francisco Public Library" ); \`\`\` ``` ## /.env.example ```example path="/.env.example" OPENAI_API_KEY="" CEREBRAS_API_KEY="" GROQ_API_KEY="" BROWSERBASE_API_KEY="" BRAINTRUST_API_KEY="" ANTHROPIC_API_KEY="" HEADLESS=false ENABLE_CACHING=false EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest" EXPERIMENTAL_EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest,o1-mini,o1-preview" EVAL_CATEGORIES="observe,act,combination,extract,experimental" STAGEHAND_API_URL="http://localhost:80" ``` ## /.github/pull_request_template ```github/pull_request_template path="/.github/pull_request_template" # why # what changed # test plan ``` ## /.github/workflows/ci.yml ```yml path="/.github/workflows/ci.yml" name: Evals on: pull_request: types: - opened - synchronize - labeled - unlabeled env: EVAL_MODELS: "gpt-4.1,gemini-2.0-flash,claude-3-5-sonnet-latest" EVAL_CATEGORIES: "observe,act,combination,extract,text_extract,targeted_extract" concurrency: group: ${{ github.ref }} cancel-in-progress: true jobs: determine-evals: runs-on: ubuntu-latest outputs: run-combination: ${{ steps.check-labels.outputs.run-combination }} run-extract: ${{ steps.check-labels.outputs.run-extract }} run-act: ${{ steps.check-labels.outputs.run-act }} run-observe: ${{ steps.check-labels.outputs.run-observe }} run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }} run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }} steps: - id: check-labels run: | # Default to running all tests on main branch if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then echo "Running all tests for main branch" echo "run-combination=true" >> $GITHUB_OUTPUT echo "run-extract=true" >> $GITHUB_OUTPUT echo "run-act=true" >> $GITHUB_OUTPUT echo "run-observe=true" >> $GITHUB_OUTPUT echo 
"run-text-extract=true" >> $GITHUB_OUTPUT echo "run-targeted-extract=true" >> $GITHUB_OUTPUT exit 0 fi # Check for specific labels echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT echo "run-text-extract=${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}" >> $GITHUB_OUTPUT echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT run-lint: runs-on: ubuntu-latest steps: - name: Check out repository code uses: actions/checkout@v4 - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies run: | rm -rf node_modules rm -f package-lock.json npm install - name: Run Lint run: npm run lint run-build: runs-on: ubuntu-latest steps: - name: Check out repository code uses: actions/checkout@v4 - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies run: | rm -rf node_modules rm -f package-lock.json npm install - name: Run Build run: npm run build run-e2e-tests: needs: [run-lint, run-build] runs-on: ubuntu-latest timeout-minutes: 50 env: HEADLESS: true steps: - name: Check out repository code uses: actions/checkout@v4 - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies run: | rm -rf node_modules rm -f package-lock.json npm install - name: Install Playwright browsers run: npm exec playwright install --with-deps - name: Build Stagehand run: npm run build - name: Run E2E Tests (Deterministic Playwright) run: npm run e2e run-e2e-local-tests: needs: [run-lint, run-build] runs-on: 
ubuntu-latest timeout-minutes: 50 env: HEADLESS: true steps: - name: Check out repository code uses: actions/checkout@v4 - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies run: | rm -rf node_modules rm -f package-lock.json npm install - name: Install Playwright browsers run: npm exec playwright install --with-deps - name: Build Stagehand run: npm run build - name: Run local E2E Tests (Deterministic Playwright) run: npm run e2e:local run-e2e-bb-tests: needs: [run-lint, run-build] runs-on: ubuntu-latest timeout-minutes: 50 if: > github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true steps: - name: Check out repository code uses: actions/checkout@v4 - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies run: | rm -rf node_modules rm -f package-lock.json npm install - name: Install Playwright browsers run: npm exec playwright install --with-deps - name: Build Stagehand run: npm run build - name: Run E2E Tests (browserbase) run: npm run e2e:bb run-regression-evals: needs: [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] runs-on: ubuntu-latest timeout-minutes: 9 outputs: regression_score: ${{ steps.set-regression-score.outputs.regression_score }} env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ 
secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase steps: - name: Check out repository code uses: actions/checkout@v4 - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies run: | rm -rf node_modules rm -f package-lock.json npm install - name: Build Stagehand run: npm run build - name: Install Playwright browsers run: npm exec playwright install --with-deps - name: Run Regression Evals run: npm run evals category regression trials=2 concurrency=20 env=BROWSERBASE - name: Log Regression Evals Performance run: | experimentName=$(jq -r '.experimentName' eval-summary.json) echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" if [ -f eval-summary.json ]; then regression_score=$(jq '.categories.regression' eval-summary.json) echo "Regression category score: $regression_score%" if (( $(echo "$regression_score < 90" | bc -l) )); then echo "Regression category score is below 90%. Failing CI." exit 1 fi else echo "Eval summary not found for regression category. Failing CI." exit 1 fi run-combination-evals: needs: [run-regression-evals, determine-evals] runs-on: ubuntu-latest timeout-minutes: 40 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase steps: - name: Check out repository code uses: actions/checkout@v4 - name: Check for 'combination' label id: label-check run: | if [ "${{ needs.determine-evals.outputs.run-combination }}" != "true" ]; then echo "has_label=false" >> $GITHUB_OUTPUT echo "No label for COMBINATION. Exiting with success." 
else echo "has_label=true" >> $GITHUB_OUTPUT fi - name: Set up Node.js if: needs.determine-evals.outputs.run-combination == 'true' uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies if: needs.determine-evals.outputs.run-combination == 'true' run: | rm -rf node_modules rm -f package-lock.json npm install - name: Build Stagehand if: needs.determine-evals.outputs.run-combination == 'true' run: npm run build - name: Install Playwright browsers if: needs.determine-evals.outputs.run-combination == 'true' run: npm exec playwright install --with-deps - name: Run Combination Evals if: needs.determine-evals.outputs.run-combination == 'true' run: npm run evals category combination - name: Log Combination Evals Performance if: needs.determine-evals.outputs.run-combination == 'true' run: | experimentName=$(jq -r '.experimentName' eval-summary.json) echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" if [ -f eval-summary.json ]; then combination_score=$(jq '.categories.combination' eval-summary.json) echo "Combination category score: $combination_score%" exit 0 else echo "Eval summary not found for combination category. Failing CI." exit 1 fi run-act-evals: needs: [run-combination-evals, determine-evals] runs-on: ubuntu-latest timeout-minutes: 25 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase steps: - name: Check out repository code uses: actions/checkout@v4 - name: Check for 'act' label id: label-check run: | if [ "${{ needs.determine-evals.outputs.run-act }}" != "true" ]; then echo "has_label=false" >> $GITHUB_OUTPUT echo "No label for ACT. 
Exiting with success." else echo "has_label=true" >> $GITHUB_OUTPUT fi - name: Set up Node.js if: needs.determine-evals.outputs.run-act == 'true' uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies if: needs.determine-evals.outputs.run-act == 'true' run: | rm -rf node_modules rm -f package-lock.json npm install - name: Build Stagehand if: needs.determine-evals.outputs.run-act == 'true' run: npm run build - name: Install Playwright browsers if: needs.determine-evals.outputs.run-act == 'true' run: npm exec playwright install --with-deps - name: Run Act Evals if: needs.determine-evals.outputs.run-act == 'true' run: npm run evals category act - name: Log Act Evals Performance if: needs.determine-evals.outputs.run-act == 'true' run: | experimentName=$(jq -r '.experimentName' eval-summary.json) echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" if [ -f eval-summary.json ]; then act_score=$(jq '.categories.act' eval-summary.json) echo "Act category score: $act_score%" if (( $(echo "$act_score < 80" | bc -l) )); then echo "Act category score is below 80%. Failing CI." exit 1 fi else echo "Eval summary not found for act category. Failing CI." 
exit 1 fi run-extract-evals: needs: [run-act-evals, determine-evals] runs-on: ubuntu-latest timeout-minutes: 50 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase steps: - name: Check out repository code uses: actions/checkout@v4 - name: Check for 'extract' label id: label-check run: | if [ "${{ needs.determine-evals.outputs.run-extract }}" != "true" ]; then echo "has_label=false" >> $GITHUB_OUTPUT echo "No label for EXTRACT. Exiting with success." else echo "has_label=true" >> $GITHUB_OUTPUT fi - name: Set up Node.js if: needs.determine-evals.outputs.run-extract == 'true' uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies if: needs.determine-evals.outputs.run-extract == 'true' run: | rm -rf node_modules rm -f package-lock.json npm install - name: Build Stagehand if: needs.determine-evals.outputs.run-extract == 'true' run: npm run build - name: Install Playwright browsers if: needs.determine-evals.outputs.run-extract == 'true' run: npm exec playwright install --with-deps # 1. Run extract category with domExtract - name: Run Extract Evals (domExtract) if: needs.determine-evals.outputs.run-extract == 'true' run: npm run evals category extract -- --extract-method=domExtract - name: Save Extract Dom Results if: needs.determine-evals.outputs.run-extract == 'true' run: mv eval-summary.json eval-summary-extract-dom.json # 2. 
Then run extract category with textExtract - name: Run Extract Evals (textExtract) if: needs.determine-evals.outputs.run-extract == 'true' run: npm run evals category extract -- --extract-method=textExtract - name: Save Extract Text Results if: needs.determine-evals.outputs.run-extract == 'true' run: mv eval-summary.json eval-summary-extract-text.json # 3. Log and Compare Extract Evals Performance - name: Log and Compare Extract Evals Performance if: needs.determine-evals.outputs.run-extract == 'true' run: | experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json) dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json) echo "DomExtract Extract category score: $dom_score%" echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}" experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json) text_score=$(jq '.categories.extract' eval-summary-extract-text.json) echo "TextExtract Extract category score: $text_score%" echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}" # If domExtract <80% fail CI if (( $(echo "$dom_score < 80" | bc -l) )); then echo "DomExtract extract category score is below 80%. Failing CI." 
exit 1 fi run-text-extract-evals: needs: [run-extract-evals, determine-evals] runs-on: ubuntu-latest timeout-minutes: 120 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase steps: - name: Check out repository code uses: actions/checkout@v4 - name: Check for 'text-extract' label id: label-check run: | if [ "${{ needs.determine-evals.outputs.run-text-extract }}" != "true" ]; then echo "has_label=false" >> $GITHUB_OUTPUT echo "No label for TEXT-EXTRACT. Exiting with success." else echo "has_label=true" >> $GITHUB_OUTPUT fi - name: Set up Node.js if: needs.determine-evals.outputs.run-text-extract == 'true' uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies if: needs.determine-evals.outputs.run-text-extract == 'true' run: | rm -rf node_modules rm -f package-lock.json npm install - name: Install Playwright browsers if: needs.determine-evals.outputs.run-text-extract == 'true' run: npm exec playwright install --with-deps - name: Build Stagehand if: needs.determine-evals.outputs.run-text-extract == 'true' run: npm run build - name: Run text_extract Evals (textExtract) if: needs.determine-evals.outputs.run-text-extract == 'true' run: npm run evals category text_extract -- --extract-method=textExtract - name: Save text_extract Results if: needs.determine-evals.outputs.run-text-extract == 'true' run: mv eval-summary.json eval-summary-text_extract-text.json - name: Log text_extract Evals Performance if: needs.determine-evals.outputs.run-text-extract == 'true' run: | experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json) text_score=$(jq '.categories.text_extract' 
eval-summary-text_extract-text.json) echo "TextExtract text_extract category score: $text_score%" echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}" # If text_score <80% fail CI if (( $(echo "$text_score < 80" | bc -l) )); then echo "textExtract text_extract category score is below 80%. Failing CI." exit 1 fi run-observe-evals: needs: [run-text-extract-evals, determine-evals] runs-on: ubuntu-latest timeout-minutes: 60 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase steps: - name: Check out repository code uses: actions/checkout@v4 - name: Check for 'observe' label id: label-check run: | if [ "${{ needs.determine-evals.outputs.run-observe }}" != "true" ]; then echo "has_label=false" >> $GITHUB_OUTPUT echo "No label for OBSERVE. Exiting with success." 
else echo "has_label=true" >> $GITHUB_OUTPUT fi - name: Set up Node.js if: needs.determine-evals.outputs.run-observe == 'true' uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies if: needs.determine-evals.outputs.run-observe == 'true' run: | rm -rf node_modules rm -f package-lock.json npm install - name: Install Playwright browsers if: needs.determine-evals.outputs.run-observe == 'true' run: npm exec playwright install --with-deps - name: Build Stagehand if: needs.determine-evals.outputs.run-observe == 'true' run: npm run build - name: Run Observe Evals if: needs.determine-evals.outputs.run-observe == 'true' run: npm run evals category observe - name: Log Observe Evals Performance if: needs.determine-evals.outputs.run-observe == 'true' run: | experimentName=$(jq -r '.experimentName' eval-summary.json) echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" if [ -f eval-summary.json ]; then observe_score=$(jq '.categories.observe' eval-summary.json) echo "Observe category score: $observe_score%" if (( $(echo "$observe_score < 80" | bc -l) )); then echo "Observe category score is below 80%. Failing CI." exit 1 fi else echo "Eval summary not found for observe category. Failing CI." 
exit 1 fi run-targeted-extract-evals: needs: [run-observe-evals, determine-evals] runs-on: ubuntu-latest timeout-minutes: 60 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true EVAL_ENV: browserbase steps: - name: Check out repository code uses: actions/checkout@v4 - name: Check for 'targeted-extract' label id: label-check run: | if [ "${{ needs.determine-evals.outputs.run-targeted-extract }}" != "true" ]; then echo "has_label=false" >> $GITHUB_OUTPUT echo "No label for TARGETED-EXTRACT. Exiting with success." else echo "has_label=true" >> $GITHUB_OUTPUT fi - name: Set up Node.js if: needs.determine-evals.outputs.run-targeted-extract == 'true' uses: actions/setup-node@v4 with: node-version: "20" - name: Install dependencies if: needs.determine-evals.outputs.run-targeted-extract == 'true' run: | rm -rf node_modules rm -f package-lock.json npm install - name: Install Playwright browsers if: needs.determine-evals.outputs.run-targeted-extract == 'true' run: npm exec playwright install --with-deps - name: Build Stagehand if: needs.determine-evals.outputs.run-targeted-extract == 'true' run: npm run build - name: Run targeted extract Evals if: needs.determine-evals.outputs.run-targeted-extract == 'true' run: npm run evals category targeted_extract -- --extract-method=textExtract - name: Log targeted extract Evals Performance if: needs.determine-evals.outputs.run-targeted-extract == 'true' run: | experimentName=$(jq -r '.experimentName' eval-summary.json) echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" if [ -f eval-summary.json ]; then targeted_extract_score=$(jq '.categories.targeted_extract' 
eval-summary.json) echo "Targeted extract category score: $targeted_extract_score%" if (( $(echo "$targeted_extract_score < 80" | bc -l) )); then echo "Targeted extract score is below 80%. Failing CI." exit 1 fi else echo "Eval summary not found for targeted_extract category. Failing CI." exit 1 fi ``` ## /.github/workflows/release.yml ```yml path="/.github/workflows/release.yml" name: Release on: push: branches: - main permissions: contents: write pull-requests: write concurrency: ${{ github.workflow }}-${{ github.ref }} jobs: release: name: Release runs-on: ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v3 - name: Setup Node.js 20.x uses: actions/setup-node@v3 with: node-version: 20.x registry-url: "https://registry.npmjs.org" - name: Install dependencies run: | rm -rf node_modules rm -f package-lock.json npm install - name: Build run: npm run build - name: Create Release Pull Request or Publish to npm id: changesets uses: changesets/action@v1 with: publish: npm run release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} - name: Publish Canary if: github.ref == 'refs/heads/main' run: | npm config set //registry.npmjs.org/:_authToken=${NODE_AUTH_TOKEN} git checkout main npm run release-canary env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} ``` ## /.gitignore ```gitignore path="/.gitignore" node_modules/ /test-results/ /playwright-report/ /blob-report/ /playwright/.cache/ screenshot.png .DS_STORE .cache/ .env downloads/ dist/ evals/**/public lib/dom/build/ evals/public *.tgz evals/playground.ts tmp/ eval-summary.json pnpm-lock.yaml evals/deterministic/tests/BrowserContext/tmp-test.har ``` ## /.prettierignore ```prettierignore path="/.prettierignore" pnpm-lock.yaml README.md **/*.json ``` ## /.prettierrc ```prettierrc path="/.prettierrc" {} ``` ## /.vscode/settings.json ```json path="/.vscode/settings.json" { "editor.defaultFormatter": "esbenp.prettier-vscode", 
"editor.formatOnSave": true } ``` ## /CHANGELOG.md # @browserbasehq/stagehand ## 2.1.0 ### Minor Changes - [#659](https://github.com/browserbase/stagehand/pull/659) [`f9a435e`](https://github.com/browserbase/stagehand/commit/f9a435e938daccfb2e54ca23fad8ef75128a4486) Thanks [@miguelg719](https://github.com/miguelg719)! - Added native support for Google Generative models (Gemini) ### Patch Changes - [#647](https://github.com/browserbase/stagehand/pull/647) [`ca5467d`](https://github.com/browserbase/stagehand/commit/ca5467de7d31bfb270b6b625224a926c52c97900) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - collapse redundant text nodes into parent elements - [#636](https://github.com/browserbase/stagehand/pull/636) [`9037430`](https://github.com/browserbase/stagehand/commit/903743097367ba6bb12baa9f0fa8f7985f543fdc) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix token act metrics and inference logging being misplaced as observe metrics and inference logging - [#648](https://github.com/browserbase/stagehand/pull/648) [`169e7ea`](https://github.com/browserbase/stagehand/commit/169e7ea9e229503ae5958eaa4511531578ee3841) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add mapping of node id -> url - [#654](https://github.com/browserbase/stagehand/pull/654) [`57a9853`](https://github.com/browserbase/stagehand/commit/57a98538381e0e54fbb734b43c50d61fd0d567df) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix repeated up & down scrolling bug for clicks inside `act` - [#624](https://github.com/browserbase/stagehand/pull/624) [`cf167a4`](https://github.com/browserbase/stagehand/commit/cf167a437865e8e8bdb8739d22c3b3bb84e185de) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- export stagehand error classes so they can be referenced from @dist - [#640](https://github.com/browserbase/stagehand/pull/640) [`178f5f0`](https://github.com/browserbase/stagehand/commit/178f5f0a8fecd876adfb4e29983853bdf7ec72fd) Thanks [@yash1744](https://github.com/yash1744)! - Added support for stagehand agents to automatically redirect to https://google.com when the page URL is empty or set to about:blank, preventing empty screenshots and saving tokens. - [#661](https://github.com/browserbase/stagehand/pull/661) [`bf823a3`](https://github.com/browserbase/stagehand/commit/bf823a36930b0686b416a42302ef8c021b4aba75) Thanks [@kamath](https://github.com/kamath)! - fix press enter - [#633](https://github.com/browserbase/stagehand/pull/633) [`86724f6`](https://github.com/browserbase/stagehand/commit/86724f6fb0abc7292423ac5bd0bebcd352f95940) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix the getBrowser logic for redundant api calls and throw informed errors - [#656](https://github.com/browserbase/stagehand/pull/656) [`c630373`](https://github.com/browserbase/stagehand/commit/c630373dede4c775875834bfb860436ba2ea48d2) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - parse out % signs from variables in act - [#637](https://github.com/browserbase/stagehand/pull/637) [`944bbbf`](https://github.com/browserbase/stagehand/commit/944bbbfe8bfb357b4910584447a93f6f402c3826) Thanks [@kamath](https://github.com/kamath)! - Fix: forward along the stack trace in StagehandDefaultError ## 2.0.0 ### Major Changes - [#591](https://github.com/browserbase/stagehand/pull/591) [`e234a0f`](https://github.com/browserbase/stagehand/commit/e234a0f80bf4c07bcc57265da216cbc4ab3bd19d) Thanks [@miguelg719](https://github.com/miguelg719)! - Announcing **Stagehand 2.0**! 🎉 We're thrilled to announce the release of Stagehand 2.0, bringing significant improvements to make browser automation more powerful, faster, and easier to use than ever before. 
### 🚀 New Features - **Introducing `stagehand.agent`**: A powerful new way to integrate SOTA Computer use models or Browserbase's [Open Operator](https://operator.browserbase.com) into Stagehand with one line of code! Perfect for multi-step workflows and complex interactions. [Learn more](https://docs.stagehand.dev/concepts/agent) - **Lightning-fast `act` and `extract`**: Major performance improvements to make your automations run significantly faster. - **Enhanced Logging**: Better visibility into what's happening during automation with improved logging and debugging capabilities. - **Comprehensive Documentation**: A completely revamped documentation site with better examples, guides, and best practices. - **Improved Error Handling**: More descriptive errors and better error recovery to help you debug issues faster. ### 🛠️ Developer Experience - **Better TypeScript Support**: Enhanced type definitions and better IDE integration - **Better Error Messages**: Clearer, more actionable error messages to help you debug faster - **Improved Caching**: More reliable action caching for better performance We're excited to see what you build with Stagehand 2.0! For questions or support, join our [Slack community](https://stagehand.dev/slack). For more details, check out our [documentation](https://docs.stagehand.dev). ### Minor Changes - [#588](https://github.com/browserbase/stagehand/pull/588) [`ba9efc5`](https://github.com/browserbase/stagehand/commit/ba9efc5580a536bc3c158e507a6c6695825c2834) Thanks [@sameelarif](https://github.com/sameelarif)! - Added support for offloading agent tasks to the API. - [#600](https://github.com/browserbase/stagehand/pull/600) [`11e015d`](https://github.com/browserbase/stagehand/commit/11e015daac56dc961b8c8d54ce360fd00d4fee38) Thanks [@sameelarif](https://github.com/sameelarif)! - Added a `stagehand.history` array which stores an array of `act`, `extract`, `observe`, and `goto` calls made. 
Since this history array is stored on the `StagehandPage` level, it will capture methods even if indirectly called by an agent. - [#601](https://github.com/browserbase/stagehand/pull/601) [`1d22604`](https://github.com/browserbase/stagehand/commit/1d2260401e27bae25779a55bb2ed7b7153c34fd0) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add custom error classes - [#599](https://github.com/browserbase/stagehand/pull/599) [`75d8fb3`](https://github.com/browserbase/stagehand/commit/75d8fb36a67cd84eb55b509bf959edc7b05059da) Thanks [@miguelg719](https://github.com/miguelg719)! - cleaner logging with pino - [#609](https://github.com/browserbase/stagehand/pull/609) [`c92295d`](https://github.com/browserbase/stagehand/commit/c92295d8424dac1a4f81066ca260ade2d5fce80b) Thanks [@kamath](https://github.com/kamath)! - Removed deprecated fields and methods from Stagehand constructor and added cdpUrl to localBrowserLaunchOptions for custom CDP URLs support. - [#571](https://github.com/browserbase/stagehand/pull/571) [`73d6736`](https://github.com/browserbase/stagehand/commit/73d67368b88002c17814e46e75a99456bf355c4e) Thanks [@miguelg719](https://github.com/miguelg719)! - You can now use Computer Using Agents (CUA) natively in Stagehand for both Anthropic and OpenAI models! This unlocks a brand new frontier of applications for Stagehand users 🤘 - [#619](https://github.com/browserbase/stagehand/pull/619) [`7b0b996`](https://github.com/browserbase/stagehand/commit/7b0b9969a58014ae3e99b2054e4463b785073cfd) Thanks [@sameelarif](https://github.com/sameelarif)! - add disablePino flag to stagehand constructor params - [#620](https://github.com/browserbase/stagehand/pull/620) [`566e587`](https://github.com/browserbase/stagehand/commit/566e5877a1861e0eae5a118d34efe09d43a37098) Thanks [@kamath](https://github.com/kamath)! - You can now pass in an OpenAI instance as an `llmClient` to the Stagehand constructor! 
This allows you to use Stagehand with any OpenAI-compatible model, like Ollama, Gemini, etc., as well as OpenAI wrappers like Braintrust. - [#586](https://github.com/browserbase/stagehand/pull/586) [`c57dc19`](https://github.com/browserbase/stagehand/commit/c57dc19c448b8c2aab82953291f4e38f202c4729) Thanks [@sameelarif](https://github.com/sameelarif)! - Added native Stagehand agentic loop functionality. This allows you to build agentic workflows with a single prompt without using a computer-use model. To try it out, create a `stagehand.agent` without passing in a provider. ### Patch Changes - [#580](https://github.com/browserbase/stagehand/pull/580) [`179e17c`](https://github.com/browserbase/stagehand/commit/179e17c2d1c9837de49c776d9850a330a759e73f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - refactor \_performPlaywrightMethod - [#608](https://github.com/browserbase/stagehand/pull/608) [`71ee10d`](https://github.com/browserbase/stagehand/commit/71ee10d50cb46e83d43fd783e1404569e6f317cf) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - added support for "scrolling to next/previous chunk" - [#594](https://github.com/browserbase/stagehand/pull/594) [`e483484`](https://github.com/browserbase/stagehand/commit/e48348412a6e651967ba22d097d5308af0e8d0a8) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - pass observeHandler into actHandler - [#569](https://github.com/browserbase/stagehand/pull/569) [`17e8b40`](https://github.com/browserbase/stagehand/commit/17e8b40f94b30f6e253443a4bbb8a3e364e58e38) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - you can now call stagehand.metrics to get token usage metrics. you can also set logInferenceToFile in stagehand config to log the entire call/response history from stagehand & the LLM. 
- [#617](https://github.com/browserbase/stagehand/pull/617) [`affa564`](https://github.com/browserbase/stagehand/commit/affa5646658399ab71ed08c1b9ce0fd776b46fca) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - use a11y tree for default extract - [#589](https://github.com/browserbase/stagehand/pull/589) [`0c4b1e7`](https://github.com/browserbase/stagehand/commit/0c4b1e7e6ff4b8a60af4a2d0d2056bff847227d5) Thanks [@miguelg719](https://github.com/miguelg719)! - Added CDP support for screenshots, find more about the benefits here: https://docs.browserbase.com/features/screenshots#why-use-cdp-for-screenshots%3F - [#584](https://github.com/browserbase/stagehand/pull/584) [`c7c1a80`](https://github.com/browserbase/stagehand/commit/c7c1a8066be33188ba1e900828045db61410025c) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix to remove unnecessary healthcheck ping on sdk - [#616](https://github.com/browserbase/stagehand/pull/616) [`2a27e1c`](https://github.com/browserbase/stagehand/commit/2a27e1c8e967befbbbb05ea71369878ac1573658) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed new opened tab handling for CUA models - [#582](https://github.com/browserbase/stagehand/pull/582) [`dfd24e6`](https://github.com/browserbase/stagehand/commit/dfd24e638ef3723d3a8a3a33ff7942af0ac4745f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - support api usage for extract with no args - [#563](https://github.com/browserbase/stagehand/pull/563) [`98166d7`](https://github.com/browserbase/stagehand/commit/98166d76d30bc67d6b04b3d5c39f78f92c254b49) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - support scrolling in `act` - [#598](https://github.com/browserbase/stagehand/pull/598) [`53889d4`](https://github.com/browserbase/stagehand/commit/53889d4b6e772098beaba2e1ee5a24e6f07706bb) Thanks [@miguelg719](https://github.com/miguelg719)!
- Fix the open operator handler to work with anthropic - [#605](https://github.com/browserbase/stagehand/pull/605) [`b8beaec`](https://github.com/browserbase/stagehand/commit/b8beaec451a03eaa5d12281fe7c8d4eb9c9d7e81) Thanks [@sameelarif](https://github.com/sameelarif)! - Added support for resuming a Stagehand session created on the API. - [#612](https://github.com/browserbase/stagehand/pull/612) [`cd36068`](https://github.com/browserbase/stagehand/commit/cd3606854c465747c78b44763469dfdfa16db1b0) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove all logic related to dom based act - [#577](https://github.com/browserbase/stagehand/pull/577) [`4fdbf63`](https://github.com/browserbase/stagehand/commit/4fdbf6324a0dc68568bba73ea4d9018b2ed67849) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove debugDom - [#603](https://github.com/browserbase/stagehand/pull/603) [`2a14a60`](https://github.com/browserbase/stagehand/commit/2a14a607f3e7fa3ca9a02670afdc7e60ccfbfb3f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - rm unused handlePossiblePageNavigation - [#614](https://github.com/browserbase/stagehand/pull/614) [`a59eaef`](https://github.com/browserbase/stagehand/commit/a59eaef67c2f4a0cb07bb0046fe7e93e2ba4dc41) Thanks [@kamath](https://github.com/kamath)! - override whatwg-url to avoid punycode warning - [#573](https://github.com/browserbase/stagehand/pull/573) [`c24f3c9`](https://github.com/browserbase/stagehand/commit/c24f3c9a58873c3920fab0f9891c2bf5245c9b5e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - return act result in actFromObserve ## 1.14.0 ### Minor Changes - [#518](https://github.com/browserbase/stagehand/pull/518) [`516725f`](https://github.com/browserbase/stagehand/commit/516725fc1c5d12d22caac0078a118c77bfe033a8) Thanks [@sameelarif](https://github.com/sameelarif)! - `act()` can now use `observe()` under the hood, resulting in significant performance improvements. 
To opt-in to this change, set `slowDomBasedAct: false` in `ActOptions`. - [#483](https://github.com/browserbase/stagehand/pull/483) [`8c9445f`](https://github.com/browserbase/stagehand/commit/8c9445fde9724ae33eeeb1234fd5b9bbd418bfdb) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - When using `textExtract`, you can now do targeted extraction by passing an xpath string into extract via the `selector` parameter. This limits the dom processing step to a target element, reducing tokens and increasing speed. For example: ```typescript const weatherData = await stagehand.page.extract({ instruction: "extract the weather data for Sun, Feb 23 at 11PM", schema: z.object({ temperature: z.string(), weather_description: z.string(), wind: z.string(), humidity: z.string(), barometer: z.string(), visibility: z.string(), }), modelName, useTextExtract, selector: xpath, // xpath of the element to extract from }); ``` - [#556](https://github.com/browserbase/stagehand/pull/556) [`499a72d`](https://github.com/browserbase/stagehand/commit/499a72dc56009791ce065270b854b12fc5570050) Thanks [@kamath](https://github.com/kamath)! - You can now set a timeout for dom-based stagehand act! Do this in `act` with `timeoutMs` as a parameter, or set a global param to `actTimeoutMs` in Stagehand config. - [#544](https://github.com/browserbase/stagehand/pull/544) [`55c9673`](https://github.com/browserbase/stagehand/commit/55c9673c5948743b804d70646f425a61818c7789) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - you can now deterministically get the full text representation of a webpage by calling `extract()` (with no arguments) - [#538](https://github.com/browserbase/stagehand/pull/538) [`d898d5b`](https://github.com/browserbase/stagehand/commit/d898d5b9e1c3b80e62e72d36d1754b3e50d5a2b4) Thanks [@sameelarif](https://github.com/sameelarif)! - Added `gpt-4.5-preview` and `claude-3-7-sonnet-latest` as supported models.
- [#523](https://github.com/browserbase/stagehand/pull/523) [`44cf7cc`](https://github.com/browserbase/stagehand/commit/44cf7cc9ac1209c97d9153281970899b10a2ddc9) Thanks [@kwt00](https://github.com/kwt00)! You can now natively run Cerebras LLMs! `cerebras-llama-3.3-70b` and `cerebras-llama-3.1-8b` are now supported models as long as `CEREBRAS_API_KEY` is set in your environment. - [#542](https://github.com/browserbase/stagehand/pull/542) [`cf7fe66`](https://github.com/browserbase/stagehand/commit/cf7fe665e6d1eeda97582ee2816f1dc3a66c6152) Thanks [@sankalpgunturi](https://github.com/sankalpgunturi)! You can now natively run Groq LLMs! `groq-llama-3.3-70b-versatile` and `groq-llama-3.3-70b-specdec` are now supported models as long as `GROQ_API_KEY` is set in your environment. ### Patch Changes - [#506](https://github.com/browserbase/stagehand/pull/506) [`e521645`](https://github.com/browserbase/stagehand/commit/e5216455ce3fc2a4f4f7aa5614ecc92354eb670c) Thanks [@miguelg719](https://github.com/miguelg719)! - fixing 5s timeout on actHandler - [#535](https://github.com/browserbase/stagehand/pull/535) [`3782054`](https://github.com/browserbase/stagehand/commit/3782054734dcd0346f84003ddd8e0e484b379459) Thanks [@miguelg719](https://github.com/miguelg719)! - Adding backwards compatibility to new act->observe pipeline by accepting actOptions - [#508](https://github.com/browserbase/stagehand/pull/508) [`270f666`](https://github.com/browserbase/stagehand/commit/270f6669f1638f52fd5cd3f133f76446ced6ef9f) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed stagehand to support multiple pages with an enhanced context - [#559](https://github.com/browserbase/stagehand/pull/559) [`18533ad`](https://github.com/browserbase/stagehand/commit/18533ad824722e4e699323248297e184bae9254e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- fix: continuously adjusting chunk size inside `act` - [#554](https://github.com/browserbase/stagehand/pull/554) [`5f1868b`](https://github.com/browserbase/stagehand/commit/5f1868bd95478b3eb517319ebca7b0af4e91d144) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix targeted extract issue with scrollintoview and not chunking correctly - [#555](https://github.com/browserbase/stagehand/pull/555) [`fc5e8b6`](https://github.com/browserbase/stagehand/commit/fc5e8b6c5a606da96e6ed572dc8ffc6caef57576) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where processAllOfDom doesn't scroll to end of page when there is dynamic content - [#552](https://github.com/browserbase/stagehand/pull/552) [`a25a4cb`](https://github.com/browserbase/stagehand/commit/a25a4cb538d64f50b5bd834dd88e8e6086a73078) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - accept xpaths with 'xpath=' prepended to the front in addition to xpaths without - [#534](https://github.com/browserbase/stagehand/pull/534) [`f0c162a`](https://github.com/browserbase/stagehand/commit/f0c162a6b4d1ac72c42f26462d7241a08b5c4e0a) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - call this.end() if the process exits - [#528](https://github.com/browserbase/stagehand/pull/528) [`c820bfc`](https://github.com/browserbase/stagehand/commit/c820bfcfc9571fea90afd1595775c5946118cfaf) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - handle attempt to close session that has already been closed when using the api - [#520](https://github.com/browserbase/stagehand/pull/520) [`f49eebd`](https://github.com/browserbase/stagehand/commit/f49eebd98c1d61413a3ea4c798595db601d55da8) Thanks [@miguelg719](https://github.com/miguelg719)!
- Performing act from a 'not-supported' ObserveResult will now throw an informed error ## 1.13.1 ### Patch Changes - [#509](https://github.com/browserbase/stagehand/pull/509) [`a7d345e`](https://github.com/browserbase/stagehand/commit/a7d345e75434aebb656e1aa5aa61caed00dc99a8) Thanks [@miguelg719](https://github.com/miguelg719)! - Bun runs will now throw a more informed error ## 1.13.0 ### Minor Changes - [#486](https://github.com/browserbase/stagehand/pull/486) [`33f2b3f`](https://github.com/browserbase/stagehand/commit/33f2b3f8deff86ac2073b6d35b7413b0aeaba2f9) Thanks [@sameelarif](https://github.com/sameelarif)! - [Unreleased] Parameterized offloading Stagehand method calls to the Stagehand API. In the future, this will allow for better observability and debugging experience. - [#494](https://github.com/browserbase/stagehand/pull/494) [`9ba4b0b`](https://github.com/browserbase/stagehand/commit/9ba4b0b563cbc77d40cac31c11e17e365a9d1749) Thanks [@pkiv](https://github.com/pkiv)! - Added LocalBrowserLaunchOptions to provide comprehensive configuration options for local browser instances. Deprecated the top-level headless option in favor of using localBrowserLaunchOptions.headless - [#500](https://github.com/browserbase/stagehand/pull/500) [`a683fab`](https://github.com/browserbase/stagehand/commit/a683fab9ca90c45d78f6602a228c2d3219b776dc) Thanks [@miguelg719](https://github.com/miguelg719)! - Including Iframes in ObserveResults. This appends any iframe(s) found in the page to the end of observe results on any observe call. - [#504](https://github.com/browserbase/stagehand/pull/504) [`577662e`](https://github.com/browserbase/stagehand/commit/577662e985a6a6b0477815853d98610f3a6b567d) Thanks [@sameelarif](https://github.com/sameelarif)! - Enabled support for Browserbase captcha solving after page navigations. This can be enabled with the new constructor parameter: `waitForCaptchaSolves`. 
- [#496](https://github.com/browserbase/stagehand/pull/496) [`28ca9fb`](https://github.com/browserbase/stagehand/commit/28ca9fbc6f3cdc88437001108a9a6c4388ba0303) Thanks [@sameelarif](https://github.com/sameelarif)! - Fixed browserbaseSessionCreateParams not being passed in to the API initialization payload. ### Patch Changes - [#459](https://github.com/browserbase/stagehand/pull/459) [`62a29ee`](https://github.com/browserbase/stagehand/commit/62a29eea982bbb855e2f885c09ac4c1334f3e0dc) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - create a11y + dom hybrid input for observe - [#463](https://github.com/browserbase/stagehand/pull/463) [`e40bf6f`](https://github.com/browserbase/stagehand/commit/e40bf6f517331fc9952c3c9f2683b7e02ffb9735) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - include 'Scrollable' annotations in a11y-dom hybrid - [#480](https://github.com/browserbase/stagehand/pull/480) [`4c07c44`](https://github.com/browserbase/stagehand/commit/4c07c444f0e71faf54413b2eeab760c7916a36e3) Thanks [@miguelg719](https://github.com/miguelg719)! - Adding a fallback try on actFromObserveResult to use the description from observe and call regular act. - [#487](https://github.com/browserbase/stagehand/pull/487) [`2c855cf`](https://github.com/browserbase/stagehand/commit/2c855cffdfa2b0af9924612b9c59df7b65df6443) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - update refine extraction prompt to ensure correct schema is used - [#497](https://github.com/browserbase/stagehand/pull/497) [`945ed04`](https://github.com/browserbase/stagehand/commit/945ed0426d34d2cb833aec8ba67bd4cba6c3b660) Thanks [@kamath](https://github.com/kamath)! - add gpt 4o november snapshot ## 1.12.0 ### Minor Changes - [#426](https://github.com/browserbase/stagehand/pull/426) [`bbbcee7`](https://github.com/browserbase/stagehand/commit/bbbcee7e7d86f5bf90cbb93f2ac9ad5935f15896) Thanks [@miguelg719](https://github.com/miguelg719)! - Observe got a major upgrade. 
Now it will return a suggested playwright method with any necessary arguments for the generated candidate elements. It also includes a major speedup when using a11y tree processing for context. - [#452](https://github.com/browserbase/stagehand/pull/452) [`16837ec`](https://github.com/browserbase/stagehand/commit/16837ece839e192fbf7b68bec128dd02f22c2613) Thanks [@kamath](https://github.com/kamath)! - add o3-mini to availablemodel - [#441](https://github.com/browserbase/stagehand/pull/441) [`1032d7d`](https://github.com/browserbase/stagehand/commit/1032d7d7d9c1ef8f30183c9019ea8324f1bdd5c6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - allow act to accept observe output ### Patch Changes - [#458](https://github.com/browserbase/stagehand/pull/458) [`da2e5d1`](https://github.com/browserbase/stagehand/commit/da2e5d1314b7504877fd50090e6a4b47f44fb9f6) Thanks [@miguelg719](https://github.com/miguelg719)! - Updated getAccessibilityTree() to make sure it doesn't skip useful nodes. Improved getXPathByResolvedObjectId() to account for text nodes and not skip generation - [#448](https://github.com/browserbase/stagehand/pull/448) [`b216072`](https://github.com/browserbase/stagehand/commit/b2160723923ed78eba83e75c7270634ca7d217de) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - improve handling of radio button clicks - [#445](https://github.com/browserbase/stagehand/pull/445) [`5bc514f`](https://github.com/browserbase/stagehand/commit/5bc514fc18e6634b1c81553bbc1e8b7d71b67d34) Thanks [@miguelg719](https://github.com/miguelg719)! - Adding back useAccessibilityTree param to observe with a deprecation warning/error indicating to use onlyVisible instead ## 1.11.0 ### Minor Changes - [#428](https://github.com/browserbase/stagehand/pull/428) [`5efeb5a`](https://github.com/browserbase/stagehand/commit/5efeb5ad44852efe7b260862729a5ac74eaa0228) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- temporarily remove vision ## 1.10.1 ### Patch Changes - [#422](https://github.com/browserbase/stagehand/pull/422) [`a2878d0`](https://github.com/browserbase/stagehand/commit/a2878d0acaf393b37763fb0c07b1a24043f7eb8d) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixing a build type error for async functions being called inside evaluate for observeHandler. ## 1.10.0 ### Minor Changes - [#412](https://github.com/browserbase/stagehand/pull/412) [`4aa4813`](https://github.com/browserbase/stagehand/commit/4aa4813ad62cefc333a04ea6b1004f5888dec70f) Thanks [@miguelg719](https://github.com/miguelg719)! - Includes a new format to get website context using accessibility (a11y) trees. The new context is provided optionally with the flag useAccessibilityTree for observe tasks. - [#417](https://github.com/browserbase/stagehand/pull/417) [`1f2b2c5`](https://github.com/browserbase/stagehand/commit/1f2b2c57d93e3b276c61224e1e26c65c2cb50e12) Thanks [@sameelarif](https://github.com/sameelarif)! - Simplify Stagehand method calls by allowing a simple string input instead of an options object. - [#405](https://github.com/browserbase/stagehand/pull/405) [`0df1e23`](https://github.com/browserbase/stagehand/commit/0df1e233d4ad4ba39da457b6ed85916d8d20e12e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - in ProcessAllOfDom, scroll on large scrollable elements instead of just the root DOM - [#373](https://github.com/browserbase/stagehand/pull/373) [`ff00965`](https://github.com/browserbase/stagehand/commit/ff00965160d568ae0bc3ca437c01f95b5c6e9039) Thanks [@sameelarif](https://github.com/sameelarif)! - Allow the input of custom instructions into the constructor so that users can guide, or provide guardrails to, the LLM in making decisions. ### Patch Changes - [#386](https://github.com/browserbase/stagehand/pull/386) [`2cee0a4`](https://github.com/browserbase/stagehand/commit/2cee0a45ae2b48d1de6543b196e338e7021e59fe) Thanks [@kamath](https://github.com/kamath)!
- add demo gif - [#362](https://github.com/browserbase/stagehand/pull/362) [`9c20de3`](https://github.com/browserbase/stagehand/commit/9c20de3e66f0ac20374d5e5e02eb107c620a2263) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - reduce collisions and improve accuracy of textExtract - [#413](https://github.com/browserbase/stagehand/pull/413) [`737b4b2`](https://github.com/browserbase/stagehand/commit/737b4b208c9214e8bb22535ab7a8daccf37610d9) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove topMostElement check when verifying visibility of text nodes - [#388](https://github.com/browserbase/stagehand/pull/388) [`e93561d`](https://github.com/browserbase/stagehand/commit/e93561d7875210ce7bd7fe841fb52decf6011fb3) Thanks [@kamath](https://github.com/kamath)! - Export LLMClient type ## 1.9.0 ### Minor Changes - [#374](https://github.com/browserbase/stagehand/pull/374) [`207244e`](https://github.com/browserbase/stagehand/commit/207244e3a46c4474d4d28db039eab131164790ca) Thanks [@sameelarif](https://github.com/sameelarif)! - Pass in a Stagehand Page object into the `on("popup")` listener to allow for multi-page handling. - [#367](https://github.com/browserbase/stagehand/pull/367) [`75c0e20`](https://github.com/browserbase/stagehand/commit/75c0e20cde54951399753e0fa841df463e1271b8) Thanks [@kamath](https://github.com/kamath)! - Logger in LLMClient is inherited by default from Stagehand. Named rather than positional arguments are used in implemented LLMClients. - [#381](https://github.com/browserbase/stagehand/pull/381) [`db2ef59`](https://github.com/browserbase/stagehand/commit/db2ef5997664e81b1dfb5ca992392362f2d3bab1) Thanks [@kamath](https://github.com/kamath)! - make logs only sync - [#385](https://github.com/browserbase/stagehand/pull/385) [`5899ec2`](https://github.com/browserbase/stagehand/commit/5899ec2c4b73c636bfd8120ec3aac225af7dd949) Thanks [@sameelarif](https://github.com/sameelarif)! 
- Moved the LLMClient logger parameter to the createChatCompletion method options. - [#364](https://github.com/browserbase/stagehand/pull/364) [`08907eb`](https://github.com/browserbase/stagehand/commit/08907ebbc2cb47cfc3151946764656a7f4ce99c6) Thanks [@kamath](https://github.com/kamath)! - exposed llmClient in stagehand constructor ### Patch Changes - [#383](https://github.com/browserbase/stagehand/pull/383) [`a77efcc`](https://github.com/browserbase/stagehand/commit/a77efccfde3a3948013eda3a52935e8a21d45b3e) Thanks [@sameelarif](https://github.com/sameelarif)! - Unified LLM input/output types for reduced dependence on OpenAI types - [`b7b3701`](https://github.com/browserbase/stagehand/commit/b7b370160bf35b09f5dc132f6e86f6e34fb70a85) Thanks [@kamath](https://github.com/kamath)! - Fix $1-types exposed to the user - [#353](https://github.com/browserbase/stagehand/pull/353) [`5c6f14b`](https://github.com/browserbase/stagehand/commit/5c6f14bade201e08cb86d2e14e246cb65707f7ee) Thanks [@kamath](https://github.com/kamath)! - Throw custom error if context is referenced without initialization, remove act/extract handler from index - [#360](https://github.com/browserbase/stagehand/pull/360) [`89841fc`](https://github.com/browserbase/stagehand/commit/89841fc42ae82559baddfe2a9593bc3260c082a2) Thanks [@kamath](https://github.com/kamath)! - Remove stagehand nav entirely - [#379](https://github.com/browserbase/stagehand/pull/379) [`b1c6579`](https://github.com/browserbase/stagehand/commit/b1c657976847de86d82324030f90c2f6a1f3f976) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - don't require LLM Client to use non-ai stagehand functions - [#371](https://github.com/browserbase/stagehand/pull/371) [`30e7d09`](https://github.com/browserbase/stagehand/commit/30e7d091445004c71aec1748d3a7d75fb86d1f11) Thanks [@kamath](https://github.com/kamath)!
- pretty readme :) - [#382](https://github.com/browserbase/stagehand/pull/382) [`a41271b`](https://github.com/browserbase/stagehand/commit/a41271baf351e20f4c79b4b654d8a947b615a121) Thanks [@sameelarif](https://github.com/sameelarif)! - Added example implementation of the Vercel AI SDK as an LLMClient - [#344](https://github.com/browserbase/stagehand/pull/344) [`c1cf345`](https://github.com/browserbase/stagehand/commit/c1cf34535ed30262989b1dbe262fb0414cdf8230) Thanks [@kamath](https://github.com/kamath)! - Remove duplicate logging and expose Page/BrowserContext types ## 1.8.0 ### Minor Changes - [#324](https://github.com/browserbase/stagehand/pull/324) [`cd23fa3`](https://github.com/browserbase/stagehand/commit/cd23fa33450107f29cb1ddb6edadfc769d336aa5) Thanks [@kamath](https://github.com/kamath)! - Move stagehand.act() -> stagehand.page.act() and deprecate stagehand.act() - [#319](https://github.com/browserbase/stagehand/pull/319) [`bacbe60`](https://github.com/browserbase/stagehand/commit/bacbe608058304bfa1f0ab049da4d8aa90e8d6f7) Thanks [@kamath](https://github.com/kamath)! - We now wrap playwright page/context within StagehandPage and StagehandContext objects. This helps us augment the Stagehand experience by being able to augment the underlying Playwright - [#324](https://github.com/browserbase/stagehand/pull/324) [`cd23fa3`](https://github.com/browserbase/stagehand/commit/cd23fa33450107f29cb1ddb6edadfc769d336aa5) Thanks [@kamath](https://github.com/kamath)! - moves extract and act -> page and deprecates stagehand.extract and stagehand.observe ### Patch Changes - [#320](https://github.com/browserbase/stagehand/pull/320) [`c0cdd0e`](https://github.com/browserbase/stagehand/commit/c0cdd0e985d66f0464d2e70b7d0cb343b0efbd3f) Thanks [@kamath](https://github.com/kamath)! 
- bug fix: set this.env to LOCAL if BROWSERBASE_API_KEY is not defined - [#325](https://github.com/browserbase/stagehand/pull/325) [`cc46f34`](https://github.com/browserbase/stagehand/commit/cc46f345c0a1dc0af4abae7e207833df17da50e7) Thanks [@pkiv](https://github.com/pkiv)! - only start domdebug if enabled ## 1.7.0 ### Minor Changes - [#316](https://github.com/browserbase/stagehand/pull/316) [`902e633`](https://github.com/browserbase/stagehand/commit/902e633e126a58b80b757ea0ecada01a7675a473) Thanks [@kamath](https://github.com/kamath)! - rename browserbaseResumeSessionID -> browserbaseSessionID - [#296](https://github.com/browserbase/stagehand/pull/296) [`f11da27`](https://github.com/browserbase/stagehand/commit/f11da27a20409c240ceeea2003d520f676def61a) Thanks [@kamath](https://github.com/kamath)! - - Deprecate fields in `init` in favor of constructor options - Deprecate `initFromPage` in favor of `browserbaseResumeSessionID` in constructor - Rename `browserBaseSessionCreateParams` -> `browserbaseSessionCreateParams` - [#304](https://github.com/browserbase/stagehand/pull/304) [`0b72f75`](https://github.com/browserbase/stagehand/commit/0b72f75f6a62aaeb28b0c488ae96db098d6a2846) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add textExtract: an optional, text based approach to the existing extract method. textExtract often performs better on long form extraction tasks. By default `extract` uses the existing approach `domExtract`. - [#298](https://github.com/browserbase/stagehand/pull/298) [`55f0cd2`](https://github.com/browserbase/stagehand/commit/55f0cd2fe7976e800833ec6e41e9af62d88d09d5) Thanks [@kamath](https://github.com/kamath)! - Add sessionId to public params ### Patch Changes - [#283](https://github.com/browserbase/stagehand/pull/283) [`b902192`](https://github.com/browserbase/stagehand/commit/b902192bc7ff8eb02c85150c1fe6f89c2a95b211) Thanks [@sameelarif](https://github.com/sameelarif)! 
- allowed customization of eval config via .env - [#299](https://github.com/browserbase/stagehand/pull/299) [`fbe2300`](https://github.com/browserbase/stagehand/commit/fbe23007176488043c2415519f25021612fff989) Thanks [@sameelarif](https://github.com/sameelarif)! - log playwright actions for better debugging ## 1.6.0 ### Minor Changes - [#286](https://github.com/browserbase/stagehand/pull/286) [`9605836`](https://github.com/browserbase/stagehand/commit/9605836ee6b8207ed7dc9146e12ced1c78630d59) Thanks [@kamath](https://github.com/kamath)! - minor improvement in action + new eval case - [#279](https://github.com/browserbase/stagehand/pull/279) [`d6d7057`](https://github.com/browserbase/stagehand/commit/d6d70570623a718354797ef83aa8489eacc085d1) Thanks [@kamath](https://github.com/kamath)! - Add support for o1-mini and o1-preview in OpenAIClient - [#282](https://github.com/browserbase/stagehand/pull/282) [`5291797`](https://github.com/browserbase/stagehand/commit/529179724a53bf2fd578a4012fd6bc6b7348d1ae) Thanks [@kamath](https://github.com/kamath)! - Added eslint for stricter type checking. Streamlined most of the internal types throughout the cache, llm, and handlers. This should make it easier to add new LLMs down the line, maintain and update the existing code, and make it easier to add new features in the future. Types can be checked by running `npx eslint .` from the project directory. ### Patch Changes - [#270](https://github.com/browserbase/stagehand/pull/270) [`6b10b3b`](https://github.com/browserbase/stagehand/commit/6b10b3b1160649b19f50d66588395ceb679b3d68) Thanks [@sameelarif](https://github.com/sameelarif)! - add close link to readme - [#288](https://github.com/browserbase/stagehand/pull/288) [`5afa0b9`](https://github.com/browserbase/stagehand/commit/5afa0b940a9f379a3719a5bbae249dd2a9ef8380) Thanks [@kamath](https://github.com/kamath)! 
- add multi-region support for browserbase - [#284](https://github.com/browserbase/stagehand/pull/284) [`474217c`](https://github.com/browserbase/stagehand/commit/474217cfaff8e68614212b66baa62d35493fd2ce) Thanks [@kamath](https://github.com/kamath)! - Build wasn't working, this addresses tsc failure. - [#236](https://github.com/browserbase/stagehand/pull/236) [`85483fe`](https://github.com/browserbase/stagehand/commit/85483fe091544fc079015c62b6923b03f8b9caa7) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - reduce chunk size ## 1.5.0 ### Minor Changes - [#266](https://github.com/browserbase/stagehand/pull/266) [`0e8f34f`](https://github.com/browserbase/stagehand/commit/0e8f34fc15aee91c548d09534deaccc8adca7c4d) Thanks [@kamath](https://github.com/kamath)! - Install wasn't working from NPM due to misconfigured build step. This attempts to fix that. ## 1.4.0 ### Minor Changes - [#253](https://github.com/browserbase/stagehand/pull/253) [`598cae2`](https://github.com/browserbase/stagehand/commit/598cae230c7b8d4e31ae22fd63047a91b63e51b8) Thanks [@sameelarif](https://github.com/sameelarif)! - clean up contexts after use ### Patch Changes - [#225](https://github.com/browserbase/stagehand/pull/225) [`a2366fe`](https://github.com/browserbase/stagehand/commit/a2366feb023180fbb2ccc7a8379692f9f8347fe5) Thanks [@sameelarif](https://github.com/sameelarif)! - Ensuring cross-platform compatibility with tmp directories - [#249](https://github.com/browserbase/stagehand/pull/249) [`7d06d43`](https://github.com/browserbase/stagehand/commit/7d06d43f2b9a477fed35793d7479de9b183e8d53) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix broken evals - [#227](https://github.com/browserbase/stagehand/pull/227) [`647eefd`](https://github.com/browserbase/stagehand/commit/647eefd651852eec495faa1b8f4dbe6b1da17999) Thanks [@kamath](https://github.com/kamath)! 
- Fix debugDom still showing chunks when set to false - [#250](https://github.com/browserbase/stagehand/pull/250) [`5886620`](https://github.com/browserbase/stagehand/commit/5886620dd1b0a57c68bf810cf130df2ca0a50a69) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add ci specific evals - [#222](https://github.com/browserbase/stagehand/pull/222) [`8dff026`](https://github.com/browserbase/stagehand/commit/8dff02674df7a6448f2262c7e212b58c03be57bc) Thanks [@sameelarif](https://github.com/sameelarif)! - Streamline type definitions and fix existing typescript errors - [#232](https://github.com/browserbase/stagehand/pull/232) [`b9f9949`](https://github.com/browserbase/stagehand/commit/b9f99494021e6a9e2487b77bb64ed0a491751400) Thanks [@kamath](https://github.com/kamath)! - Minor changes to package.json and tsconfig, mainly around the build process. Also add more type defs and remove unused dependencies. ## 1.3.0 ### Minor Changes - [#195](https://github.com/browserbase/stagehand/pull/195) [`87a6305`](https://github.com/browserbase/stagehand/commit/87a6305d9a2faf1ab5915965913bc14d5cc15772) Thanks [@kamath](https://github.com/kamath)! - - Adds structured and more standardized JSON logging - Doesn't init cache if `enableCaching` is false, preventing `tmp/.cache` from being created - Updates bundling for browser-side code to support NextJS and serverless ## 1.2.0 ### Minor Changes - [#179](https://github.com/browserbase/stagehand/pull/179) [`0031871`](https://github.com/browserbase/stagehand/commit/0031871d5a6d6180f272a68b88a8634e5a991785) Thanks [@navidkpr](https://github.com/navidkpr)! - Fixes: The last big change we pushed out, introduced a small regression. As a result, the gray outline showing the elements Stagehand is looking at is missing. This commit fixes that. We now process selectorMap properly (using the updated type Record

The production-ready framework for AI browser automations.
Read the Docs

MIT License Slack Community

browserbase/stagehand | Trendshift

## Why Stagehand? Most existing browser automation tools either require you to write low-level code in a framework like Selenium, Playwright, or Puppeteer, or use high-level agents that can be unpredictable in production. By letting developers choose what to write in code vs. natural language, Stagehand is the natural choice for browser automations in production. 1. **Choose when to write code vs. natural language**: use AI when you want to navigate unfamiliar pages, and use code ([Playwright](https://playwright.dev/)) when you know exactly what you want to do. 2. **Preview and cache actions**: Stagehand lets you preview AI actions before running them, and also helps you easily cache repeatable actions to save time and tokens. 3. **Computer use models with one line of code**: Stagehand lets you integrate SOTA computer use models from OpenAI and Anthropic into the browser with one line of code. ## Example Here's how to build a sample browser automation with Stagehand:
See Stagehand in Action
```typescript // Use Playwright functions on the page object const page = stagehand.page; await page.goto("https://github.com/browserbase"); // Use act() to execute individual actions await page.act("click on the stagehand repo"); // Use Computer Use agents for larger actions const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", }); await agent.execute("Get to the latest PR"); // Use extract() to read data from the page const { author, title } = await page.extract({ instruction: "extract the author and title of the PR", schema: z.object({ author: z.string().describe("The username of the PR author"), title: z.string().describe("The title of the PR"), }), }); ``` ## Documentation Visit [docs.stagehand.dev](https://docs.stagehand.dev) to view the full documentation. ## Getting Started Start with Stagehand with one line of code, or check out our [Quickstart Guide](https://docs.stagehand.dev/get_started/quickstart) for more information: ```bash npx create-browser-app ```

Watch Anirudh demo create-browser-app to create a Stagehand project!

### Build and Run from Source ```bash git clone https://github.com/browserbase/stagehand.git cd stagehand npm install npx playwright install npm run build npm run example # run the blank script at ./examples/example.ts ``` Stagehand is best when you have an API key for an LLM provider and Browserbase credentials. To add these to your project, run: ```bash cp .env.example .env nano .env # Edit the .env file to add API keys ``` ## Contributing > [!NOTE] > We highly value contributions to Stagehand! For questions or support, please join our [Slack community](https://stagehand.dev/slack). At a high level, we're focused on improving reliability, speed, and cost in that order of priority. If you're interested in contributing, we strongly recommend reaching out to [Anirudh Kamath](https://x.com/kamathematic) or [Paul Klein](https://x.com/pk_iv) in our [Slack community](https://stagehand.dev/slack) before starting to ensure that your contribution aligns with our goals. For more information, please see our [Contributing Guide](https://docs.stagehand.dev/contributions/contributing). ## Acknowledgements This project heavily relies on [Playwright](https://playwright.dev/) as a resilient backbone to automate the web. It also would not be possible without the awesome techniques and discoveries made by [tarsier](https://github.com/reworkd/tarsier), [gemini-zod](https://github.com/jbeoris/gemini-zod), and [fuji-web](https://github.com/normal-computing/fuji-web). We'd like to thank the following people for their major contributions to Stagehand: - [Paul Klein](https://github.com/pkiv) - [Anirudh Kamath](https://github.com/kamath) - [Sean McGuire](https://github.com/seanmcguire12) - [Miguel Gonzalez](https://github.com/miguelg719) - [Sameel Arif](https://github.com/sameelarif) - [Filip Michalsky](https://github.com/filip-michalsky) - [Jeremy Press](https://x.com/jeremypress) - [Navid Pour](https://github.com/navidpour) ## License Licensed under the MIT License. 
Copyright 2025 Browserbase, Inc. ## /docs/logging.md # Stagehand Logging System The Stagehand logging system uses [Pino](https://getpino.io/) to provide structured, efficient, and configurable logging. ## Log Levels Stagehand uses three primary log levels: | Level | Name | Description | | ----- | ----- | --------------------------------------- | | 0 | error | Critical errors and important warnings | | 1 | info | Standard information messages (default) | | 2 | debug | Detailed information for debugging | The verbosity of logging is controlled by the `verbose` option when creating a Stagehand instance: ```typescript const stagehand = new Stagehand({ verbose: 2, // Show all logs up to debug level // other options... }); ``` ## Using the Logger The logging system is automatically initialized with your Stagehand instance. You can access it directly via: ```typescript // Log an error stagehand.log({ message: "An error occurred", level: 0, category: "error", }); // Log info (level 1 is default) stagehand.log({ message: "Operation completed", category: "operation", }); // Log debug information stagehand.log({ message: "Debug details", level: 2, category: "debug", auxiliary: { details: { value: JSON.stringify({ key: "value" }), type: "object", }, }, }); ``` ## Inference Logging For detailed logging of inference operations (act, extract, observe), Stagehand provides specialized logging: ```typescript // Enable inference logging to file const stagehand = new Stagehand({ logInferenceToFile: true, // other options... }); ``` When enabled, inference logs are written to the `inference_summary` directory in your project. ## Pretty Printing By default, logs in development are formatted with colors and readable timestamps using Pino's pretty formatting. For production environments or when sending logs to external systems, you can disable pretty printing. 
## Customizing Logging ### Using Your Own Logger You can provide your own custom logger when creating a Stagehand instance: ```typescript const stagehand = new Stagehand({ logger: (logLine) => { // Your custom logging logic here console.log(`[${logLine.category}] ${logLine.message}`); }, // other options... }); ``` When you provide a custom logger, Stagehand will automatically disable its internal Pino logger to prevent duplicate logging. Your logger will receive all log events directly. ### Configuring Pino If you want to use Pino but with custom configuration: ```typescript import { StagehandLogger } from "@browserbasehq/stagehand/lib/logger"; // Create a custom configured logger const customLogger = new StagehandLogger({ pretty: true, level: "debug", // Other Pino options... }); // Pass it to Stagehand const stagehand = new Stagehand({ logger: (logLine) => customLogger.log(logLine), // other options... }); ``` ## Advanced Usage ### Creating a New StagehandLogger Instance You can create a standalone logger for use in your application: ```typescript import { StagehandLogger } from "@browserbasehq/stagehand/lib/logger"; const logger = new StagehandLogger({ pretty: true, level: "debug", }); logger.info("Information message"); logger.debug("Debug message", { details: "some data" }); logger.error("Error message", { error: "details" }); ``` ### Configuring Log Output You can direct logs to a file or other destination: ```typescript import fs from "fs"; import { StagehandLogger } from "@browserbasehq/stagehand/lib/logger"; const fileStream = fs.createWriteStream("./logs/application.log", { flags: "a", }); const logger = new StagehandLogger({ destination: fileStream, }); ``` ### Disabling Pino Explicitly If you want to handle all logging yourself without using Pino: ```typescript import { StagehandLogger } from "@browserbasehq/stagehand/lib/logger"; const logger = new StagehandLogger( { usePino: false, }, (logLine) => { // Your custom logging logic 
console.log(`[${logLine.level}] ${logLine.message}`); }, ); ``` ## Troubleshooting If you're not seeing logs: 1. Check your `verbose` setting - it may be too low for the log levels you're trying to see 2. Verify that your log messages have the correct level set 3. If using a custom logger, ensure it's correctly handling the log messages If you're seeing duplicate logs: 1. Make sure you're not creating multiple instances of StagehandLogger that log to the same output 2. If providing a custom logger to Stagehand, it will automatically disable the internal Pino logger If logs are not being written to files: 1. Ensure you have write permissions to the target directory 2. Check that the `logInferenceToFile` option is enabled 3. Verify that the destination path exists or can be created ## /docs/media/chunks.png Binary file available at https://raw.githubusercontent.com/browserbase/stagehand/refs/heads/main/docs/media/chunks.png ## /docs/media/stagehand-playwright.png Binary file available at https://raw.githubusercontent.com/browserbase/stagehand/refs/heads/main/docs/media/stagehand-playwright.png ## /docs/release.md # Releasing We use [Changesets](https://github.com/changesets/changesets) to version and release our packages. When we merge to main, the release workflow will: 1. Create a release pull request with: - A version bump for the package calculated by the changesets. - A changelog entry summarizing the changes in the release. 1. Create an `alpha` version of the package with whatever is merged to main, and you can install it with `npm install @browserbasehq/stagehand@alpha`. This is useful for testing the release before it's published to the `latest` tag. When the pull request is merged, the release workflow will publish the package to npm with the version calculated by the changesets. For more information on how changesets work, see the [changesets docs](https://github.com/changesets/changesets) and our [release.yml file](/.github/workflows/release.yml). 
# Manually Releasing > [!WARNING] > You should not need to manually release unless absolutely necessary. Our automated release workflow handles this for you when changes are merged to main. When you're ready to cut a release, start by versioning the packages: ``` npx changeset version ``` This will consume the changesets in [`.changeset`](../.changeset) and update the [changelog](../CHANGELOG.md) and [`package.json`](../package.json): ``` % git status --short M CHANGELOG.md M package.json ``` Based on the versions implications declared by the changesets, the package version will be updated to the next patch, minor, or major: ```diff "name": "@browserbasehq/stagehand", - "version": "1.3.0", + "version": "1.3.1", ``` Since we updated the `package.json`, we should also update the lockfile ([`package-lock.json`](../package-lock.json)) for tidiness: ``` npm install ``` Now the lockfile should be updated: ``` % git status --short M CHANGELOG.md M package-lock.json M package.json ``` The diff will look something like this: ```diff { "name": "@browserbasehq/stagehand", - "version": "1.3.0", + "version": "1.3.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@browserbasehq/stagehand", - "version": "1.3.0", + "version": "1.3.1", ``` At this point we're ready to commit our changes. It's probably a good idea to have some consistency around the name of this commit message: ``` git commit -am 'Version Packages' ``` Ok, now it's time to publish the release. Before we do, we have to build the artifacts that comprise the tarball. Let's clean our working directory first so that we don't accidentally include anything in the tarball that shouldn't be there: ``` % git clean -fxd -e .env Removing dist/ Removing lib/dom/build/ Removing node_modules/ ``` Let's reinstall dependencies and build the artifacts: ``` npm install && npm run build ``` Now we're ready to publish to NPM. 
You have to be logged in via the `npm` CLI and have to be part of the `@browserbasehq` org: ``` npx changeset publish ``` Congratulations! You just published a new version of `@browserbasehq/stagehand`. 🤘 In the process of publishing, Changesets created an [annotated git tag](https://git-scm.com/book/en/v2/Git-Basics-Tagging): ``` 🦋 Creating git tag... 🦋 New tag: v1.3.1 ``` Let's push the commit and tag to GitHub for posterity: ``` git push --follow-tags ``` ## /eslint.config.mjs ```mjs path="/eslint.config.mjs" import globals from "globals"; import pluginJs from "@eslint/js"; import tseslint from "typescript-eslint"; /** @type {import('eslint').Linter.Config[]} */ export default [ { files: ["**/*.{js,mjs,cjs,ts}"] }, { languageOptions: { globals: globals.browser } }, { ignores: ["**/dist/**", "lib/dom/build/**"] }, pluginJs.configs.recommended, ...tseslint.configs.recommended, ]; ``` ## /evals/args.ts ```ts path="/evals/args.ts" import process from "process"; import { EvalCategorySchema } from "@/types/evals"; const rawArgs = process.argv.slice(2); const parsedArgs: { env?: string; trials?: number; concurrency?: number; extractMethod?: string; provider?: string; leftover: string[]; } = { leftover: [], }; for (const arg of rawArgs) { if (arg.startsWith("env=")) { parsedArgs.env = arg.split("=")[1]?.toLowerCase(); } else if (arg.startsWith("trials=")) { const val = parseInt(arg.split("=")[1], 10); if (!isNaN(val)) { parsedArgs.trials = val; } } else if (arg.startsWith("concurrency=")) { const val = parseInt(arg.split("=")[1], 10); if (!isNaN(val)) { parsedArgs.concurrency = val; } } else if (arg.startsWith("--extract-method=")) { parsedArgs.extractMethod = arg.split("=")[1]; } else if (arg.startsWith("provider=")) { parsedArgs.provider = arg.split("=")[1]?.toLowerCase(); } else { parsedArgs.leftover.push(arg); } } /** Apply environment defaults or overrides */ if (parsedArgs.env === "browserbase") { process.env.EVAL_ENV = "BROWSERBASE"; } else if (parsedArgs.env === 
"local") { process.env.EVAL_ENV = "LOCAL"; } if (parsedArgs.trials !== undefined) { process.env.EVAL_TRIAL_COUNT = String(parsedArgs.trials); } if (parsedArgs.concurrency !== undefined) { process.env.EVAL_MAX_CONCURRENCY = String(parsedArgs.concurrency); } const extractMethod = parsedArgs.extractMethod || "domExtract"; process.env.EXTRACT_METHOD = extractMethod; const useTextExtract = extractMethod === "textExtract"; const useAccessibilityTree = extractMethod === "accessibilityTree"; const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES ? process.env.EVAL_CATEGORIES.split(",") : [ "observe", "act", "combination", "extract", "experimental", "text_extract", "targeted_extract", "regression_llm_providers", "regression", "llm_clients", "agent", ]; // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name let filterByCategory: string | null = null; let filterByEvalName: string | null = null; if (parsedArgs.leftover.length > 0) { if (parsedArgs.leftover[0].toLowerCase() === "category") { filterByCategory = parsedArgs.leftover[1]; if (!filterByCategory) { console.error("Error: Category name not specified."); process.exit(1); } try { EvalCategorySchema.parse(filterByCategory); } catch { console.error( `Error: Invalid category "${filterByCategory}". Valid categories are: ${DEFAULT_EVAL_CATEGORIES.join(", ")}`, ); process.exit(1); } } else { // If leftover[0] is not "category", interpret it as a task/eval name filterByEvalName = parsedArgs.leftover[0]; } } if (parsedArgs.provider !== undefined) { process.env.EVAL_PROVIDER = parsedArgs.provider; } export { filterByCategory, filterByEvalName, useTextExtract, useAccessibilityTree, DEFAULT_EVAL_CATEGORIES, parsedArgs, }; ``` ## /evals/assets/cart.html ```html path="/evals/assets/cart.html" Document
``` ## /evals/assets/peeler.html ```html path="/evals/assets/peeler.html" Document

Welcome to Our Page

Knife Set

High-quality stainless steel knives for all your cooking needs.my stuff more stuff

Peeler

The ultimate tool for peeling fruits and vegetables.

hi world

Baseball evolved from older bat-and-ball games already being played in England by the mid-18th century. This game was brought by immigrants to North America, where the modern version developed.

``` ## /evals/deterministic/auxiliary/logo.png Binary file available at https://raw.githubusercontent.com/browserbase/stagehand/refs/heads/main/evals/deterministic/auxiliary/logo.png ## /evals/deterministic/bb.playwright.config.ts ```ts path="/evals/deterministic/bb.playwright.config.ts" import { defineConfig, devices } from "@playwright/test"; /** * See https://playwright.dev/docs/test-configuration. */ export default defineConfig({ testDir: "./tests/browserbase", /* Fail the build on CI if you accidentally left test.only in the source code. */ /* Run tests in files in parallel */ fullyParallel: true, /* Reporter to use. See https://playwright.dev/docs/test-reporters */ // reporter: "html", reporter: "line", /* Retry on CI only */ retries: 2, /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ use: { /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */ trace: "on-first-retry", }, /* Configure projects for major browsers */ projects: [ { name: "chromium", use: { ...devices["Desktop Chrome"] }, }, ], }); ``` ## /evals/deterministic/e2e.playwright.config.ts ```ts path="/evals/deterministic/e2e.playwright.config.ts" import { defineConfig, devices } from "@playwright/test"; /** * See https://playwright.dev/docs/test-configuration. */ export default defineConfig({ // Look in "tests" for test files... testDir: "./tests", // ...but ignore anything in "tests/browserbase & "tests/local" testIgnore: ["**/browserbase/**", "**/local/**"], /* Fail the build on CI if you accidentally left test.only in the source code. */ /* Run tests in files in parallel */ fullyParallel: true, /* Reporter to use. See https://playwright.dev/docs/test-reporters */ // reporter: "html", reporter: "line", retries: 2, /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ use: { /* Collect trace when retrying the failed test. 
See https://playwright.dev/docs/trace-viewer */ trace: "on-first-retry", }, /* Configure projects for major browsers */ projects: [ { name: "chromium", use: { ...devices["Desktop Chrome"] }, }, ], }); ``` ## /evals/deterministic/local.playwright.config.ts ```ts path="/evals/deterministic/local.playwright.config.ts" import { defineConfig, devices } from "@playwright/test"; /** * See https://playwright.dev/docs/test-configuration. */ export default defineConfig({ testDir: "./tests/local", /* Maximum time one test can run for. */ timeout: 30 * 1000, /* Fail the build on CI if you accidentally left test.only in the source code. */ forbidOnly: !!process.env.CI, /* Run tests in files in parallel */ fullyParallel: false, /* Reporter to use */ reporter: "line", /* Retry on CI only */ retries: process.env.CI ? 2 : 0, /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ use: { /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */ trace: "on-first-retry", }, /* Configure projects for major browsers */ projects: [ { name: "chromium", use: { ...devices["Desktop Chrome"] }, }, ], }); ``` ## /evals/deterministic/stagehand.config.ts ```ts path="/evals/deterministic/stagehand.config.ts" import { default as DefaultStagehandConfig } from "@/stagehand.config"; import type { ConstructorParams } from "@/dist"; import dotenv from "dotenv"; dotenv.config({ path: "../../.env" }); const StagehandConfig: ConstructorParams = { ...DefaultStagehandConfig, env: "LOCAL" /* Environment to run Stagehand in */, verbose: 1 /* Logging verbosity level (0=quiet, 1=normal, 2=verbose) */, browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID, }, enableCaching: false /* Enable caching functionality */, localBrowserLaunchOptions: { headless: true /* Run browser in headless mode */, }, }; export default StagehandConfig; ``` ## /evals/deterministic/tests/BrowserContext/addInitScript.test.ts 
```ts path="/evals/deterministic/tests/BrowserContext/addInitScript.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandContext - addInitScript", () => { test("should inject a script on the context before pages load", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const context = stagehand.context; await context.addInitScript(() => { const w = window as typeof window & { __testContextScriptVar?: string; }; w.__testContextScriptVar = "Hello from context.initScript!"; }); const pageA = await context.newPage(); await pageA.goto("https://example.com"); const resultA = await pageA.evaluate(() => { const w = window as typeof window & { __testContextScriptVar?: string; }; return w.__testContextScriptVar; }); expect(resultA).toBe("Hello from context.initScript!"); const pageB = await context.newPage(); await pageB.goto("https://docs.browserbase.com"); const resultB = await pageB.evaluate(() => { const w = window as typeof window & { __testContextScriptVar?: string; }; return w.__testContextScriptVar; }); expect(resultB).toBe("Hello from context.initScript!"); await stagehand.close(); }); }); ``` ## /evals/deterministic/tests/BrowserContext/cookies.test.ts ```ts path="/evals/deterministic/tests/BrowserContext/cookies.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandContext - Cookies", () => { let stagehand: Stagehand; test.beforeEach(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterEach(async () => { await stagehand.close(); }); test("should add cookies and retrieve them", async () => { const context = stagehand.context; // This is the wrapped BrowserContext const url = "https://example.com"; await 
context.addCookies([ { name: "myCookie", value: "myValue", domain: "example.com", path: "/", expires: Math.floor(Date.now() / 1000) + 3600, httpOnly: false, secure: false, sameSite: "Lax", }, ]); const cookies = await context.cookies(url); expect(cookies.length).toBeGreaterThan(0); const myCookie = cookies.find((c) => c.name === "myCookie"); expect(myCookie).toBeDefined(); expect(myCookie?.value).toBe("myValue"); }); test("should clear all cookies", async () => { const context = stagehand.context; const url = "https://example.com"; await context.addCookies([ { name: "myOtherCookie", value: "anotherValue", domain: "example.com", path: "/", expires: Math.floor(Date.now() / 1000) + 3600, httpOnly: false, secure: false, sameSite: "Lax", }, ]); const cookiesBefore = await context.cookies(url); const found = cookiesBefore.some((c) => c.name === "myOtherCookie"); expect(found).toBe(true); await context.clearCookies(); const cookiesAfter = await context.cookies(url); const stillFound = cookiesAfter.some((c) => c.name === "myOtherCookie"); expect(stillFound).toBe(false); }); }); ``` ## /evals/deterministic/tests/BrowserContext/multiPage.test.ts ```ts path="/evals/deterministic/tests/BrowserContext/multiPage.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; import { Page } from "@/dist"; import http from "http"; import express from "express"; import { Server as WebSocketServer } from "ws"; test.describe("StagehandContext - Multi-page Support", () => { let stagehand: Stagehand; let server: http.Server; let wss: WebSocketServer; let serverPort: number; test.beforeAll(async () => { // Set up a local Express server const app = express(); // Serve test pages app.get("/page1", (_req, res) => { res.set("Content-Type", "text/html"); res.end(` Page 1

Page 1 Content

`); }); app.get("/page2", (_req, res) => { res.set("Content-Type", "text/html"); res.end(` Page 2

Page 2 Content

`); }); // Create the server on a random free port server = http.createServer(app); await new Promise((resolve) => { server.listen(0, () => resolve()); }); const address = server.address(); if (typeof address === "object" && address !== null) { serverPort = address.port; } else { throw new Error("Failed to get server port"); } // Set up WebSocket for future tests wss = new WebSocketServer({ server, path: "/socket" }); wss.on("connection", (ws) => { console.log("WebSocket client connected"); ws.send("Hello from server WebSocket"); }); }); test.beforeEach(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterEach(async () => { await stagehand.close(); }); test.afterAll(async () => { wss?.close(); server?.close(); }); /** * Test enhanced page capabilities */ test("should provide enhanced capabilities for new pages", async () => { const context = stagehand.context; const newPage = await context.newPage(); // Verify enhanced methods expect(typeof newPage.act).toBe("function"); expect(typeof newPage.extract).toBe("function"); expect(typeof newPage.observe).toBe("function"); // Verify basic Playwright functionality expect(typeof newPage.goto).toBe("function"); expect(typeof newPage.click).toBe("function"); // Test navigation maintains capabilities await newPage.goto(`http://localhost:${serverPort}/page1`); expect(typeof newPage.act).toBe("function"); expect(await newPage.title()).toBe("Page 1"); }); /** * Test context.pages() functionality */ test("should return array of enhanced pages via context.pages()", async () => { const context = stagehand.context; // Create multiple pages const page1 = await context.newPage(); const page2 = await context.newPage(); await page1.goto(`http://localhost:${serverPort}/page1`); await page2.goto(`http://localhost:${serverPort}/page2`); const pages = context.pages(); expect(pages).toContain(page1); expect(pages).toContain(page2); // Verify all pages have enhanced capabilities for (const page of 
pages) { expect(typeof page.act).toBe("function"); expect(typeof page.extract).toBe("function"); expect(typeof page.observe).toBe("function"); } }); /** * Test popup handling */ test("should handle popups with enhanced capabilities", async () => { const mainPage = stagehand.page; let popupPage: Page | null = null; mainPage.on("popup", (page: Page) => { popupPage = page; }); await mainPage.goto(`http://localhost:${serverPort}/page1`); await mainPage.click("#popupBtn"); // Verify popup has enhanced capabilities expect(popupPage).not.toBeNull(); expect(typeof popupPage.act).toBe("function"); expect(typeof popupPage.extract).toBe("function"); expect(typeof popupPage.observe).toBe("function"); if (popupPage) { await popupPage.waitForLoadState(); expect(await popupPage.title()).toBe("Page 2"); } }); /** * Test page tracking and cleanup */ test("should properly track and cleanup pages", async () => { const context = stagehand.context; const initialPages = context.pages().length; const newPage = await context.newPage(); await newPage.goto(`http://localhost:${serverPort}/page1`); expect(context.pages().length).toBe(initialPages + 1); await newPage.close(); expect(context.pages().length).toBe(initialPages); }); /** * Test enhanced methods across pages */ test("should support enhanced methods across all pages", async () => { const page1 = await stagehand.context.newPage(); const page2 = await stagehand.context.newPage(); await page1.goto(`http://localhost:${serverPort}/page1`); await page2.goto(`http://localhost:${serverPort}/page2`); // Verify both pages have enhanced capabilities expect(typeof page1.act).toBe("function"); expect(typeof page1.extract).toBe("function"); expect(typeof page1.observe).toBe("function"); expect(typeof page2.act).toBe("function"); expect(typeof page2.extract).toBe("function"); expect(typeof page2.observe).toBe("function"); }); /** * Test active page tracking */ test("should update stagehand.page when creating new pages", async () => { const 
initialPage = stagehand.page; // Create a new page and verify it becomes active const newPage = await stagehand.context.newPage(); expect(stagehand.page).toBe(newPage); expect(stagehand.page).not.toBe(initialPage); // Navigate and verify it's still the active page await newPage.goto(`http://localhost:${serverPort}/page1`); expect(stagehand.page).toBe(newPage); expect(await stagehand.page.title()).toBe("Page 1"); // Create another page and verify it becomes active const anotherPage = await stagehand.context.newPage(); expect(stagehand.page).toBe(anotherPage); expect(stagehand.page).not.toBe(newPage); }); }); ``` ## /evals/deterministic/tests/BrowserContext/page.test.ts ```ts path="/evals/deterministic/tests/BrowserContext/page.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; import http from "http"; import express from "express"; import { Server as WebSocketServer } from "ws"; test.describe("StagehandContext - pages and newPage", () => { let stagehand: Stagehand; let server: http.Server; let wss: WebSocketServer; let serverPort: number; test.beforeAll(async () => { // 1. Spin up a local Express server const app = express(); // Serve a single page at "/" app.get("/", (_req, res) => { res.set("Content-Type", "text/html"); res.end(` Test Page

Hello from local server

`); }); // Create the server on a random free port server = http.createServer(app); await new Promise((resolve) => { server.listen(0, () => resolve()); }); const address = server.address(); if (typeof address === "object" && address !== null) { serverPort = address.port; } else { throw new Error("Failed to get server port"); } // Optionally set up a WebSocket for future tests wss = new WebSocketServer({ server, path: "/socket" }); wss.on("connection", (ws) => { console.log("WebSocket client connected"); ws.send("Hello from server WebSocket"); }); }); test.beforeEach(async () => { // 2. Create & init Stagehand for each test stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterEach(async () => { await stagehand.close(); }); test.afterAll(async () => { // Shut down local server wss?.close(); server?.close(); }); /** * Test context.newPage() and context.pages() */ test("should create multiple pages and list them via context.pages()", async () => { const context = stagehand.context; // Create multiple pages const page1 = await context.newPage(); const page2 = await context.newPage(); // Confirm context.pages() sees them const allPages = context.pages(); // We expect at least these 2 pages. If a default blank page existed, total might be more. 
// The key is that page1 & page2 are in the array: expect(allPages).toContain(page1); expect(allPages).toContain(page2); // Navigate page1 to the local server await page1.goto(`http://localhost:${serverPort}`); expect(await page1.title()).toBe("Test Page"); }); }); ``` ## /evals/deterministic/tests/BrowserContext/routing.test.ts ```ts path="/evals/deterministic/tests/BrowserContext/routing.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; import http from "http"; import express from "express"; import { Server as WebSocketServer } from "ws"; import fs from "fs"; import path from "path"; const HAR_CONTENT = `{ "log": { "version": "1.2", "creator": { "name": "PlaywrightTest", "version": "1.0" }, "entries": [ { "startedDateTime": "2023-01-01T00:00:00.000Z", "time": 5, "request": { "method": "GET", "url": "http://localhost/har-example.json", "httpVersion": "HTTP/1.1", "cookies": [], "headers": [], "queryString": [], "headersSize": -1, "bodySize": 0 }, "response": { "status": 200, "statusText": "OK", "httpVersion": "HTTP/1.1", "cookies": [], "headers": [{"name":"Content-Type","value":"application/json"}], "content": { "size": 27, "mimeType": "application/json", "text": "{\\"harKey\\":\\"harValue\\"}" }, "redirectURL": "", "headersSize": -1, "bodySize": 0 }, "cache": {}, "timings": { "send": 0, "wait": 5, "receive": 0 } } ] } }`; test.describe("StagehandContext - Routing APIs with dynamic setup", () => { let stagehand: Stagehand; let server: http.Server; let wss: WebSocketServer; let serverPort: number; test.beforeAll(async () => { const app = express(); app.get("/example.json", (_req, res) => { res.json({ original: "server-data" }); }); app.get("/har-example.json", (_req, res) => { res.json({ fromServer: "This should be replaced by HAR if routeFromHar is in effect", }); }); server = http.createServer(app); await new Promise((resolve) => { server.listen(0, () 
=> resolve()); }); const address = server.address(); if (typeof address === "object" && address !== null) { serverPort = address.port; } else { throw new Error("Failed to get server port"); } // Set up a WebSocket endpoint at "/socket" wss = new WebSocketServer({ server, path: "/socket" }); wss.on("connection", (ws) => { console.log("WebSocket client connected"); ws.send("Hello from server WebSocket"); // Echo messages back ws.on("message", (message) => { console.log("Server received WS message:", message); ws.send(`Server echo: ${message}`); }); }); }); test.beforeEach(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterEach(async () => { await stagehand.close(); }); test.afterAll(async () => { wss?.close(); server?.close(); }); test("should intercept requests, mock the response, handle websockets, and unroute them", async () => { const context = stagehand.context; const baseURL = `http://localhost:${serverPort}`; // 1. route: intercept "/example.json" and fulfill with a mock response await context.route("**/example.json", async (route) => { console.log("[route] Intercepting:", route.request().url()); // Mock the response entirely: await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify({ mockedData: 1234 }), }); }); // 2. routeWebSocket: intercept "/socket" await context.routeWebSocket("**/socket", async (pageSideRoute) => { console.log("Intercepting WebSocket at:", pageSideRoute.url()); // Connect to the real server const serverSideRoute = pageSideRoute.connectToServer(); // Page -> Server pageSideRoute.onMessage((msg) => { console.log("Page -> Server message:", msg); // Forward to server side serverSideRoute.send(msg); }); // Server -> Page serverSideRoute.onMessage((msg) => { console.log("Server -> Page message:", msg); pageSideRoute.send(msg); }); }); // 3. 
Open a page and fetch /example.json const page = await context.newPage(); await page.goto(baseURL); const fetchResult = await page.evaluate(async () => { const res = await fetch("/example.json"); return res.json(); }); // We should get the mocked data from our route, not the real 'server-data' expect(fetchResult.mockedData).toBe(1234); // 4. Test the WebSocket // We'll store messages from the server in an array so we can assert them const wsMessages: string[] = []; page.on("console", (msg) => { // We'll parse out the console logs we used for WebSocket if (msg.type() === "log") { wsMessages.push(msg.text()); } }); // Create a WS from the page await page.evaluate((port) => { const ws = new WebSocket(`ws://localhost:${port}/socket`); ws.onmessage = (evt) => { console.log(`WS message from server: ${evt.data}`); }; setTimeout(() => { // send a message from the page side ws.send("Hello from the client"); }, 1000); }, serverPort); // Wait a moment for messages await page.waitForTimeout(3000); // We expect the server to have initially sent "Hello from server WebSocket" // And also an echo of "Hello from the client" => "Server echo: Hello from the client" const initialHello = wsMessages.find((m) => m.includes("Hello from server WebSocket"), ); expect(initialHello).toBeTruthy(); const echoMessage = wsMessages.find((m) => m.includes("Server echo: Hello from the client"), ); expect(echoMessage).toBeTruthy(); // 5. unroute the JSON route await context.unroute("**/example.json"); // 6. confirm the WebSocket route is still active // do a second fetch -> This time it won't be mocked const fetchResult2 = await page.evaluate(async () => { const res = await fetch("/example.json"); return res.json(); }); // The real server returns { original: "server-data" } expect(fetchResult2.original).toBe("server-data"); // 7. 
unrouteAll await context.unrouteAll(); }); test("should demonstrate routeFromHar usage", async () => { const harPath = path.join(__dirname, "tmp-test.har"); const dynamicHar = HAR_CONTENT.replace( "http://localhost/har-example.json", `http://localhost:${serverPort}/har-example.json`, ); fs.writeFileSync(harPath, dynamicHar, "utf-8"); const context = stagehand.context; await context.routeFromHAR(harPath, { update: false }); const page = await context.newPage(); await page.goto(`http://localhost:${serverPort}/har-example.json`); const bodyText = await page.evaluate(() => document.body.innerText); console.log("HAR-based body text:", bodyText); expect(bodyText).toContain("harKey"); expect(bodyText).toContain("harValue"); await context.unrouteAll(); fs.unlinkSync(harPath); }); }); ``` ## /evals/deterministic/tests/Errors/apiKeyError.test.ts ```ts path="/evals/deterministic/tests/Errors/apiKeyError.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; import { z } from "zod"; test.describe("API key/LLMClient error", () => { test("Should confirm that we get an error if we call extract without LLM API key or LLMClient", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); await stagehand.page.goto("https://docs.browserbase.com/introduction"); let errorThrown: Error | null = null; try { await stagehand.page.extract({ instruction: "From the introduction page, extract the explanation of what Browserbase is.", schema: z.object({ stars: z.string().describe("the explanation of what Browserbase is"), }), }); } catch (error) { errorThrown = error as Error; } expect(errorThrown).toBeInstanceOf(Error); expect(errorThrown?.message).toContain( "No LLM API key or LLM Client configured", ); await stagehand.close(); }); test("Should confirm that we get an error if we call act without LLM API key or LLMClient", async () => { const stagehand = new 
Stagehand(StagehandConfig); await stagehand.init(); await stagehand.page.goto("https://docs.browserbase.com/introduction"); let errorThrown: Error | null = null; try { await stagehand.page.act({ action: "Click on the 'Quickstart' section", }); } catch (error) { errorThrown = error as Error; } expect(errorThrown).toBeInstanceOf(Error); expect(errorThrown?.message).toContain( "No LLM API key or LLM Client configured", ); await stagehand.close(); }); test("Should confirm that we get an error if we call observe without LLM API key or LLMClient", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); await stagehand.page.goto("https://docs.browserbase.com/introduction"); let errorThrown: Error | null = null; try { await stagehand.page.observe(); } catch (error) { errorThrown = error as Error; } expect(errorThrown).toBeInstanceOf(Error); expect(errorThrown?.message).toContain( "No LLM API key or LLM Client configured", ); await stagehand.close(); }); }); ``` ## /evals/deterministic/tests/browserbase/contexts.test.ts ```ts path="/evals/deterministic/tests/browserbase/contexts.test.ts" import Browserbase from "@browserbasehq/sdk"; import { expect, test } from "@playwright/test"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; import { Stagehand } from "@/dist"; // Configuration const CONTEXT_TEST_URL = "https://docs.browserbase.com"; const BROWSERBASE_PROJECT_ID = process.env.BROWSERBASE_PROJECT_ID!; const BROWSERBASE_API_KEY = process.env.BROWSERBASE_API_KEY!; const bb = new Browserbase({ apiKey: BROWSERBASE_API_KEY, }); // Helper functions function addHour(date: Date): number { const SECOND = 1000; return new Date(date.getTime() + 60 * 60 * 1000).getTime() / SECOND; } async function findCookie(stagehand: Stagehand, name: string) { const defaultContext = stagehand.context; const cookies = await defaultContext?.cookies(); return cookies?.find((cookie) => cookie.name === name); } async function createContext() { 
console.log("Creating a new context...");
  const context = await bb.contexts.create({
    projectId: BROWSERBASE_PROJECT_ID,
  });
  const contextId = context.id;
  console.log(`Context created with ID: ${contextId}`);
  return contextId;
}

/**
 * Navigates the session to CONTEXT_TEST_URL and adds a uniquely-named,
 * one-hour cookie to the Stagehand context, then sanity-checks that it
 * is visible via findCookie().
 *
 * Fix: the original asserted `expect(findCookie(...)).toBeDefined()`
 * without awaiting — a pending Promise is always defined, so the check
 * was vacuous. The promise is now awaited before asserting.
 */
async function setRandomCookie(contextId: string, stagehand: Stagehand) {
  console.log(
    `Populating context ${contextId} during session ${stagehand.browserbaseSessionID}`,
  );
  const page = stagehand.page;
  await page.goto(CONTEXT_TEST_URL, { waitUntil: "domcontentloaded" });

  const now = new Date();
  const testCookieName = `bb_${now.getTime().toString()}`;
  const testCookieValue = now.toISOString();
  await stagehand.context.addCookies([
    {
      domain: `.${new URL(CONTEXT_TEST_URL).hostname}`,
      expires: addHour(now),
      name: testCookieName,
      path: "/",
      value: testCookieValue,
    },
  ]);
  expect(await findCookie(stagehand, testCookieName)).toBeDefined();
  console.log(`Set test cookie: ${testCookieName}=${testCookieValue}`);
  return { testCookieName, testCookieValue };
}

test.describe("Contexts", () => {
  test("Persists and re-uses a context", async () => {
    let contextId: string;
    let testCookieName: string;
    let testCookieValue: string;
    let stagehand: Stagehand;

    await test.step("Create a context", async () => {
      contextId = await createContext();
    });

    await test.step("Instantiate Stagehand with the context to persist", async () => {
      // We will be adding cookies to the context in this session, so we need mark persist=true
      stagehand = new Stagehand({
        ...StagehandConfig,
        browserbaseSessionCreateParams: {
          projectId: BROWSERBASE_PROJECT_ID,
          browserSettings: {
            context: {
              id: contextId,
              persist: true,
            },
          },
        },
      });
      await stagehand.init();
    });

    await test.step("Set a random cookie on the page", async () => {
      ({ testCookieName } = await setRandomCookie(contextId, stagehand));
      const page = stagehand.page;
      await page.goto("https://www.google.com", {
        waitUntil: "domcontentloaded",
      });
      await page.goBack();
    });

    await test.step("Validate cookie persistence between pages", async () => {
      const
cookie = await findCookie(stagehand, testCookieName); const found = !!cookie; expect(found).toBe(true); console.log("Cookie persisted between pages:", found); await stagehand.close(); // Wait for context to persist console.log("Waiting for context to persist..."); await new Promise((resolve) => setTimeout(resolve, 5000)); }); await test.step("Create another session with the same context", async () => { // We don't need to persist cookies in this session, so we can mark persist=false const newStagehand = new Stagehand({ ...StagehandConfig, browserbaseSessionCreateParams: { projectId: BROWSERBASE_PROJECT_ID, browserSettings: { context: { id: contextId, persist: false, }, }, }, }); await newStagehand.init(); console.log( `Reusing context ${contextId} during session ${newStagehand.browserbaseSessionID}`, ); const newPage = newStagehand.page; await newPage.goto(CONTEXT_TEST_URL, { waitUntil: "domcontentloaded" }); const foundCookie = await findCookie(newStagehand, testCookieName); console.log("Cookie found in new session:", !!foundCookie); console.log( "Cookie value matches:", foundCookie?.value === testCookieValue, ); await newStagehand.close(); }); }); }); ``` ## /evals/deterministic/tests/browserbase/downloads.test.ts ```ts path="/evals/deterministic/tests/browserbase/downloads.test.ts" import { test, expect } from "@playwright/test"; import AdmZip from "adm-zip"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; import { Stagehand } from "@/dist"; import Browserbase from "@browserbasehq/sdk"; const downloadRe = /sandstorm-(\d{13})+\.mp3/; test("Downloads", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; const context = stagehand.context; const client = await context.newCDPSession(page); await client.send("Browser.setDownloadBehavior", { behavior: "allow", // `downloadPath` gets appended to the browser's default download directory. 
// set to "downloads", it ends up being "/app/apps/browser/downloads/". downloadPath: "downloads", eventsEnabled: true, }); await page.goto("https://browser-tests-alpha.vercel.app/api/download-test"); const [download] = await Promise.all([ page.waitForEvent("download"), page.locator("#download").click(), ]); const downloadError = await download.failure(); await stagehand.close(); if (downloadError !== null) { throw new Error( `Download for session ${stagehand.browserbaseSessionID} failed: ${downloadError}`, ); } expect(async () => { const bb = new Browserbase(); const zipBuffer = await bb.sessions.downloads.list( stagehand.browserbaseSessionID, ); if (!zipBuffer) { throw new Error( `Download buffer is empty for session ${stagehand.browserbaseSessionID}`, ); } const zip = new AdmZip(Buffer.from(await zipBuffer.arrayBuffer())); const zipEntries = zip.getEntries(); const mp3Entry = zipEntries.find((entry) => downloadRe.test(entry.entryName), ); if (!mp3Entry) { throw new Error( `Session ${stagehand.browserbaseSessionID} is missing a file matching "${downloadRe.toString()}" in its zip entries: ${JSON.stringify(zipEntries.map((entry) => entry.entryName))}`, ); } const expectedFileSize = 6137541; expect(mp3Entry.header.size).toBe(expectedFileSize); }).toPass({ timeout: 30_000, }); }); ``` ## /evals/deterministic/tests/browserbase/sessions.test.ts ```ts path="/evals/deterministic/tests/browserbase/sessions.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; import Browserbase from "@browserbasehq/sdk"; test.describe("Browserbase Sessions", () => { let browserbase: Browserbase; let sessionId: string; let bigStagehand: Stagehand; test.beforeAll(async () => { browserbase = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY, }); bigStagehand = new Stagehand({ ...StagehandConfig, env: "BROWSERBASE", browserbaseSessionCreateParams: { projectId: 
process.env.BROWSERBASE_PROJECT_ID,
        keepAlive: true,
      },
    });
    await bigStagehand.init();
    await bigStagehand.page.goto(
      "https://docs.stagehand.dev/get_started/introduction",
    );
    sessionId = bigStagehand.browserbaseSessionID;
    if (!sessionId) {
      throw new Error("Failed to get browserbase session ID");
    }
  });

  test.afterAll(async () => {
    await bigStagehand.close();
  });

  test("resumes a session via sessionId", async () => {
    const stagehand = new Stagehand({
      ...StagehandConfig,
      env: "BROWSERBASE",
      browserbaseSessionID: sessionId,
    });
    await stagehand.init();
    const page = stagehand.page;
    expect(page.url()).toBe(
      "https://docs.stagehand.dev/get_started/introduction",
    );
    await stagehand.close();
  });

  test("resumes a session via CDP URL", async () => {
    const session = await browserbase.sessions.retrieve(sessionId);
    const stagehand = new Stagehand({
      ...StagehandConfig,
      env: "LOCAL",
      localBrowserLaunchOptions: {
        headless: true,
        cdpUrl: session.connectUrl,
      },
    });
    await stagehand.init();
    const page = stagehand.page;
    expect(page.url()).toBe(
      "https://docs.stagehand.dev/get_started/introduction",
    );
    // Fix: this test previously leaked its Stagehand instance; the
    // sibling "via sessionId" test closes its own, so mirror that here
    // to release the local browser connection.
    await stagehand.close();
  });
});
```

## /evals/deterministic/tests/browserbase/uploads.test.ts

```ts path="/evals/deterministic/tests/browserbase/uploads.test.ts"
import { join } from "node:path";
import { test, expect } from "@playwright/test";
import { Stagehand } from "@/dist";
import StagehandConfig from "@/evals/deterministic/stagehand.config";

test.describe("Playwright Upload", () => {
  let stagehand: Stagehand;

  test.beforeAll(async () => {
    stagehand = new Stagehand(StagehandConfig);
    await stagehand.init();
  });

  test.afterAll(async () => {
    await stagehand.close();
  });

  test("uploads a file", async () => {
    const page = stagehand.page;
    await page.goto("https://browser-tests-alpha.vercel.app/api/upload-test");

    const fileInput = page.locator("#fileUpload");
    await fileInput.setInputFiles(
      join(__dirname, "../..", "auxiliary", "logo.png"),
    );

    const fileNameSpan = page.locator("#fileName");
    const fileName = await
fileNameSpan.innerText(); const fileSizeSpan = page.locator("#fileSize"); const fileSize = Number(await fileSizeSpan.innerText()); expect(fileName).toBe("logo.png"); expect(fileSize).toBeGreaterThan(0); }); }); ``` ## /evals/deterministic/tests/local/create.test.ts ```ts path="/evals/deterministic/tests/local/create.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import path from "path"; import fs from "fs"; import os from "os"; import type { Cookie } from "@playwright/test"; import StagehandConfig from "../../stagehand.config"; test.describe("Local browser launch options", () => { test("launches with default options when no localBrowserLaunchOptions provided", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const context = stagehand.context; expect(context.browser()).toBeDefined(); expect(context.pages().length).toBe(1); await stagehand.close(); }); test("respects custom userDataDir", async () => { const customUserDataDir = path.join(os.tmpdir(), "custom-user-data"); const stagehand = new Stagehand({ ...StagehandConfig, localBrowserLaunchOptions: { userDataDir: customUserDataDir, headless: true, }, }); await stagehand.init(); expect(fs.existsSync(customUserDataDir)).toBeTruthy(); await stagehand.close(); // Cleanup fs.rmSync(customUserDataDir, { recursive: true, force: true }); }); test("applies custom viewport settings", async () => { const customViewport = { width: 1920, height: 1080 }; const stagehand = new Stagehand({ ...StagehandConfig, localBrowserLaunchOptions: { ...StagehandConfig.localBrowserLaunchOptions, viewport: customViewport, }, }); await stagehand.init(); const page = await stagehand.context.newPage(); const viewport = page.viewportSize(); expect(viewport).toEqual(customViewport); await stagehand.close(); }); test("applies custom cookies", async () => { const testCookies: Cookie[] = [ { name: "testCookie", value: "testValue", domain: "example.com", path: "/", 
expires: -1, httpOnly: false, secure: false, sameSite: "Lax" as const, }, ]; const stagehand = new Stagehand({ ...StagehandConfig, localBrowserLaunchOptions: { ...StagehandConfig.localBrowserLaunchOptions, cookies: testCookies, }, }); await stagehand.init(); const page = await stagehand.context.newPage(); await page.goto("https://example.com"); const cookies = await stagehand.context.cookies(); expect(cookies[0]).toMatchObject( testCookies[0] as unknown as Record, ); await stagehand.close(); }); test("applies custom geolocation settings", async () => { const customGeolocation = { latitude: 40.7128, longitude: -74.006, }; const stagehand = new Stagehand({ ...StagehandConfig, localBrowserLaunchOptions: { ...StagehandConfig.localBrowserLaunchOptions, geolocation: customGeolocation, permissions: ["geolocation"], }, }); await stagehand.init(); const page = await stagehand.context.newPage(); await page.goto("https://example.com"); const location = await page.evaluate(() => { return new Promise((resolve) => { navigator.geolocation.getCurrentPosition( (position) => { resolve({ latitude: position.coords.latitude, longitude: position.coords.longitude, }); }, () => resolve(null), ); }); }); expect(location).toEqual(customGeolocation); await stagehand.close(); }); test("applies custom timezone and locale", async () => { const stagehand = new Stagehand({ ...StagehandConfig, localBrowserLaunchOptions: { ...StagehandConfig.localBrowserLaunchOptions, locale: "ja-JP", timezoneId: "Asia/Tokyo", }, }); await stagehand.init(); const page = await stagehand.context.newPage(); await page.goto("https://example.com"); const { locale, timezone } = await page.evaluate(() => ({ locale: navigator.language, timezone: Intl.DateTimeFormat().resolvedOptions().timeZone, })); expect(locale).toBe("ja-JP"); expect(timezone).toBe("Asia/Tokyo"); await stagehand.close(); }); test("records video when enabled", async () => { const videoDir = path.join(os.tmpdir(), "test-videos"); fs.mkdirSync(videoDir, { 
recursive: true }); const stagehand = new Stagehand({ ...StagehandConfig, localBrowserLaunchOptions: { ...StagehandConfig.localBrowserLaunchOptions, recordVideo: { dir: videoDir, size: { width: 800, height: 600 }, }, }, }); await stagehand.init(); const page = await stagehand.context.newPage(); await page.goto("https://example.com"); await stagehand.close(); const videos = fs.readdirSync(videoDir); expect(videos.length).toBeGreaterThan(0); expect(videos[0]).toMatch(/\.webm$/); // Cleanup fs.rmSync(videoDir, { recursive: true, force: true }); }); }); ``` ## /evals/deterministic/tests/page/addInitScript.test.ts ```ts path="/evals/deterministic/tests/page/addInitScript.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - addInitScript", () => { test("should inject a script before the page loads", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.addInitScript(() => { const w = window as typeof window & { __testInitScriptVar?: string; }; w.__testInitScriptVar = "Hello from init script!"; }); await page.goto("https://example.com"); const result = await page.evaluate(() => { const w = window as typeof window & { __testInitScriptVar?: string; }; return w.__testInitScriptVar; }); expect(result).toBe("Hello from init script!"); await page.goto("https://docs.browserbase.com/"); const resultAfterNavigation = await page.evaluate(() => { const w = window as typeof window & { __testInitScriptVar?: string; }; return w.__testInitScriptVar; }); expect(resultAfterNavigation).toBe("Hello from init script!"); await stagehand.close(); }); test("checks if init scripts are re-added and available even if they've been deleted", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto( 
"https://browserbase.github.io/stagehand-eval-sites/sites/aigrant/", ); // delete the __stagehandInjected flag, and delete the // getScrollableElementXpaths function await page.evaluate(() => { delete window.getScrollableElementXpaths; delete window.__stagehandInjected; }); // attempt to call the getScrollableElementXpaths function // which we previously deleted. page.evaluate should realize // its been deleted and re-inject it const xpaths = await page.evaluate(() => { return window.getScrollableElementXpaths(); }); await stagehand.close(); // this is the only scrollable element on the page expect(xpaths).toContain("/html"); }); }); ``` ## /evals/deterministic/tests/page/addRemoveLocatorHandler.test.ts ```ts path="/evals/deterministic/tests/page/addRemoveLocatorHandler.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - addLocatorHandler and removeLocatorHandler", () => { // This HTML snippet is reused by both tests. // The "Sign up to the newsletter" overlay appears after 2 seconds. // The "No thanks" button hides it. const overlayHTML = ` `; test("should use a custom locator handler to dismiss the overlay", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const { page } = stagehand; await page.addLocatorHandler( page.getByText("Sign up to the newsletter"), async () => { console.log("Overlay detected. 
Clicking 'No thanks' to remove it..."); await page.getByRole("button", { name: "No thanks" }).click(); }, ); await page.goto("https://example.com"); await page.setContent(overlayHTML); await page.waitForTimeout(5000); await page.getByRole("button", { name: "Start here" }).click(); const isOverlayVisible = await page .getByText("Sign up to the newsletter") .isVisible() .catch(() => false); await stagehand.close(); expect(isOverlayVisible).toBeFalsy(); }); test("should remove a custom locator handler so overlay stays visible", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const { page } = stagehand; const locator = page.getByText("Sign up to the newsletter"); await page.addLocatorHandler(locator, async () => { console.log("Overlay detected. Clicking 'No thanks' to remove it..."); await page.getByRole("button", { name: "No thanks" }).click(); }); await page.removeLocatorHandler(locator); console.log("Locator handler removed — overlay will not be dismissed now."); await page.goto("https://example.com"); await page.setContent(overlayHTML); await page.waitForTimeout(5000); await page.getByRole("button", { name: "Start here" }).click(); const isOverlayVisible = await page .getByText("Sign up to the newsletter") .isVisible() .catch(() => false); await stagehand.close(); expect(isOverlayVisible).toBe(true); }); }); ``` ## /evals/deterministic/tests/page/addTags.test.ts ```ts path="/evals/deterministic/tests/page/addTags.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - addScriptTag and addStyleTag", () => { let stagehand: Stagehand; test.beforeAll(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterAll(async () => { await stagehand.close(); }); test("should inject a script tag and have access to the defined function", async () => { const { page } 
= stagehand; await page.setContent(`

Hello, world!

`); await page.addScriptTag({ content: ` window.sayHello = function() { document.getElementById("greeting").textContent = "Hello from injected script!"; } `, }); await page.evaluate(() => { const w = window as typeof window & { sayHello?: () => void; }; w.sayHello?.(); }); const text = await page.locator("#greeting").textContent(); expect(text).toBe("Hello from injected script!"); }); test("should inject a style tag and apply styles", async () => { const { page } = stagehand; await page.setContent(`
Some text
`); await page.addStyleTag({ content: ` #styledDiv { color: red; font-weight: bold; } `, }); const color = await page.evaluate(() => { const el = document.getElementById("styledDiv"); return window.getComputedStyle(el!).color; }); expect(color).toBe("rgb(255, 0, 0)"); const fontWeight = await page.evaluate(() => { const el = document.getElementById("styledDiv"); return window.getComputedStyle(el!).fontWeight; }); expect(["bold", "700"]).toContain(fontWeight); }); }); ``` ## /evals/deterministic/tests/page/bringToFront.test.ts ```ts path="/evals/deterministic/tests/page/bringToFront.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - bringToFront", () => { test("should bring a background page to the front and allow further actions", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const { page: page1 } = stagehand; const page2 = await stagehand.context.newPage(); await page2.goto("https://example.com"); const page2Title = await page2.title(); console.log("Page2 Title:", page2Title); await page1.goto("https://www.google.com"); const page1TitleBefore = await page1.title(); console.log("Page1 Title before:", page1TitleBefore); await page1.bringToFront(); await page1.goto("https://docs.browserbase.com"); const page1TitleAfter = await page1.title(); console.log("Page1 Title after:", page1TitleAfter); await page2.bringToFront(); const page2URLBefore = page2.url(); console.log("Page2 URL before navigation:", page2URLBefore); await stagehand.close(); expect(page1TitleBefore).toContain("Google"); expect(page1TitleAfter).toContain("Browserbase"); expect(page2Title).toContain("Example Domain"); }); }); ``` ## /evals/deterministic/tests/page/content.test.ts ```ts path="/evals/deterministic/tests/page/content.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from 
"@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - content", () => { test("should retrieve the full HTML content of the page", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://example.com"); const html = await page.content(); expect(html).toContain("Example Domain"); expect(html).toContain("

Example Domain

"); await stagehand.close(); }); }); ``` ## /evals/deterministic/tests/page/evaluate.test.ts ```ts path="/evals/deterministic/tests/page/evaluate.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - JavaScript Evaluation", () => { test("can evaluate JavaScript in the page context", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://example.com"); const sum = await page.evaluate(() => 2 + 2); expect(sum).toBe(4); const pageTitle = await page.evaluate(() => document.title); expect(pageTitle).toMatch(/example/i); const obj = await page.evaluate(() => { return { message: "Hello from the browser", userAgent: navigator.userAgent, }; }); expect(obj).toHaveProperty("message", "Hello from the browser"); expect(obj.userAgent).toBeDefined(); await stagehand.close(); }); }); ``` ## /evals/deterministic/tests/page/expose.test.ts ```ts path="/evals/deterministic/tests/page/expose.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - evaluateHandle, exposeBinding, exposeFunction", () => { let stagehand: Stagehand; test.beforeAll(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterAll(async () => { await stagehand.close(); }); test("demonstrates evaluateHandle, exposeBinding, and exposeFunction", async () => { const { page } = stagehand; await page.setContent(`
Initial Text
`); const divHandle = await page.evaluateHandle(() => { return document.getElementById("myDiv"); }); await divHandle.evaluate((div, newText) => { div.textContent = newText; }, "Text updated via evaluateHandle"); const text = await page.locator("#myDiv").textContent(); expect(text).toBe("Text updated via evaluateHandle"); await page.exposeBinding("myBinding", async (source, arg: string) => { console.log("myBinding called from page with arg:", arg); return `Node responded with: I got your message: "${arg}"`; }); const responseFromBinding = await page.evaluate(async () => { const w = window as typeof window & { myBinding?: (arg: string) => Promise; }; return w.myBinding?.("Hello from the browser"); }); expect(responseFromBinding).toMatch(/I got your message/); await page.exposeFunction("addNumbers", (a: number, b: number) => { return a + b; }); const sum = await page.evaluate(async () => { const w = window as typeof window & { addNumbers?: (a: number, b: number) => number; }; return w.addNumbers?.(3, 7); }); expect(sum).toBe(10); }); }); ``` ## /evals/deterministic/tests/page/frames.test.ts ```ts path="/evals/deterministic/tests/page/frames.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - frame operations", () => { let stagehand: Stagehand; test.beforeAll(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterAll(async () => { await stagehand.close(); }); test("should use page.mainFrame(), page.frames(), page.frame(), and page.frameLocator()", async () => { const { page } = stagehand; await page.setContent(` `); await page.waitForSelector('iframe[name="frame-one"]'); await page.waitForSelector('iframe[name="frame-two"]'); const frames = page.frames(); console.log( "All frames found:", frames.map((f) => f.name()), ); expect(frames).toHaveLength(3); const mainFrame = page.mainFrame(); 
console.log("Main frame name:", mainFrame.name()); expect(mainFrame.name()).toBe(""); const frameOne = page.frame({ name: "frame-one" }); expect(frameOne).not.toBeNull(); const frameOneText = await frameOne?.locator("h1").textContent(); expect(frameOneText).toBe("Hello from Frame 1"); const frameTwoLocator = page.frameLocator("iframe[name='frame-two']"); const frameTwoText = await frameTwoLocator.locator("h1").textContent(); expect(frameTwoText).toBe("Hello from Frame 2"); const frameTwo = page.frame({ name: "frame-two" }); expect(frameTwo).not.toBeNull(); const frameTwoTextAgain = await frameTwo?.locator("h1").textContent(); expect(frameTwoTextAgain).toBe("Hello from Frame 2"); }); }); ``` ## /evals/deterministic/tests/page/getBy.test.ts ```ts path="/evals/deterministic/tests/page/getBy.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - Built-in locators", () => { let stagehand: Stagehand; test.beforeAll(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterAll(async () => { await stagehand.close(); }); test("demonstrates getByAltText, getByLabel, getByPlaceholder, getByRole, getByTestId, getByText, getByTitle", async () => { const { page } = stagehand; await page.setContent(` Profile picture
Hello World!

This is some descriptive text on the page.

Site Title

`); const image = page.getByAltText("Profile picture"); await expect(image).toBeVisible(); const usernameInput = page.getByLabel("Username"); await expect(usernameInput).toBeVisible(); const emailInput = page.getByPlaceholder("Enter your email"); await expect(emailInput).toBeVisible(); const signInButton = page.getByRole("button", { name: "Sign in" }); await expect(signInButton).toBeVisible(); const greetingDiv = page.getByTestId("greeting"); await expect(greetingDiv).toHaveText("Hello World!"); const descriptiveText = page.getByText( "This is some descriptive text on the page.", ); await expect(descriptiveText).toBeVisible(); const heading = page.getByTitle("A heading for the page"); await expect(heading).toHaveText("Site Title"); }); }); ``` ## /evals/deterministic/tests/page/navigation.test.ts ```ts path="/evals/deterministic/tests/page/navigation.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - Navigation", () => { test("should navigate back and forward between pages", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://example.com"); expect(page.url()).toBe("https://example.com/"); await page.goto("https://docs.browserbase.com/introduction"); expect(page.url()).toBe("https://docs.browserbase.com/introduction"); await page.goBack(); expect(page.url()).toBe("https://example.com/"); await page.goForward(); expect(page.url()).toBe("https://docs.browserbase.com/introduction"); await stagehand.close(); }); }); ``` ## /evals/deterministic/tests/page/on.test.ts ```ts path="/evals/deterministic/tests/page/on.test.ts" import { expect, test } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - page.on()", () => { test("should click 
on the crewAI blog tab", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto( "https://docs.browserbase.com/integrations/crew-ai/introduction", ); let clickPromise: Promise; page.on("popup", async (newPage) => { clickPromise = newPage.click( "body > div.page-wrapper > div.navbar-2.w-nav > div.padding-global.top-bot > div > div.navigation-left > nav > a:nth-child(7)", ); }); await page.goto( "https://docs.browserbase.com/integrations/crew-ai/introduction", ); await page.click( "#content-area > div.relative.mt-8.prose.prose-gray.dark\\:prose-invert > p:nth-child(2) > a", ); await clickPromise; await stagehand.close(); }); test("should close the new tab and navigate to it on the existing page", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto( "https://docs.browserbase.com/integrations/crew-ai/introduction", ); let navigatePromise: Promise; page.on("popup", async (newPage) => { navigatePromise = Promise.allSettled([ newPage.close(), page.goto(newPage.url(), { waitUntil: "domcontentloaded" }), ]); }); // Click on the crewAI blog tab await page.click( "#content-area > div.relative.mt-8.prose.prose-gray.dark\\:prose-invert > p:nth-child(2) > a", ); await navigatePromise; await page.click( "body > div.page-wrapper > div.navbar-2.w-nav > div.padding-global.top-bot > div > div.navigation-left > nav > a:nth-child(3)", ); await page.waitForLoadState("domcontentloaded"); const currentUrl = page.url(); expect(currentUrl).toBe("https://www.crewai.com/open-source"); await stagehand.close(); }); test("should handle console events", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://example.com"); const messages: string[] = []; page.on("console", (msg) => { messages.push(msg.text()); }); await page.evaluate(() => console.log("Test 
console log")); expect(messages).toContain("Test console log"); await stagehand.close(); }); test("should handle dialog events", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://example.com"); page.on("dialog", async (dialog) => { expect(dialog.message()).toBe("Test alert"); await dialog.dismiss(); }); await page.evaluate(() => alert("Test alert")); await stagehand.close(); }); test("should handle request and response events", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://example.com"); const requests: string[] = []; const responses: string[] = []; page.on("request", (request) => { requests.push(request.url()); }); page.on("response", (response) => { responses.push(response.url()); }); await page.goto("https://example.com"); expect(requests).toContain("https://example.com/"); expect(responses).toContain("https://example.com/"); await stagehand.close(); }); }); ``` ## /evals/deterministic/tests/page/pageContext.test.ts ```ts path="/evals/deterministic/tests/page/pageContext.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - page.context()", () => { let stagehand: Stagehand; test.beforeEach(async () => { stagehand = new Stagehand(StagehandConfig); await stagehand.init(); }); test.afterEach(async () => { if (stagehand) { try { await stagehand.close(); } catch (error) { console.error("[afterEach] Error during stagehand.close():", error); } } else { console.log("[afterEach] Stagehand was not defined, skipping close()."); } }); test("should confirm page.context() and stagehand.context share state", async () => { const page = stagehand.page; const stagehandContext = stagehand.context; const pageContext = page.context(); await 
pageContext.addCookies([ { name: "stagehandTestCookie", value: "hello-stagehand", domain: "example.com", path: "/", expires: Math.floor(Date.now() / 1000) + 3600, // 1 hour httpOnly: false, secure: false, sameSite: "Lax", }, ]); const cookies = await stagehandContext.cookies("https://example.com"); const testCookie = cookies.find((c) => c.name === "stagehandTestCookie"); expect(testCookie).toBeDefined(); expect(testCookie?.value).toBe("hello-stagehand"); const extraPage = await pageContext.newPage(); await extraPage.goto("https://example.com"); const contextPages = stagehandContext.pages(); // The newly created page should be recognized by stagehandContext as well. const foundExtraPage = contextPages.find( (p) => p.url() === "https://example.com/", ); expect(foundExtraPage).toBeDefined(); }); }); ``` ## /evals/deterministic/tests/page/reload.test.ts ```ts path="/evals/deterministic/tests/page/reload.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - Reload", () => { test("should reload the page and reset page state", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://docs.browserbase.com/"); await page.evaluate(() => { const w = window as typeof window & { __testReloadMarker?: string; }; w.__testReloadMarker = "Hello Reload!"; }); const markerBeforeReload = await page.evaluate(() => { const w = window as typeof window & { __testReloadMarker?: string; }; return w.__testReloadMarker; }); expect(markerBeforeReload).toBe("Hello Reload!"); await page.reload(); const markerAfterReload = await page.evaluate(() => { const w = window as typeof window & { __testReloadMarker?: string; }; return w.__testReloadMarker; }); expect(markerAfterReload).toBeUndefined(); await stagehand.close(); }); }); ``` ## 
/evals/deterministic/tests/page/waitFor.test.ts ```ts path="/evals/deterministic/tests/page/waitFor.test.ts" import { test, expect } from "@playwright/test"; import { Stagehand } from "@/dist"; import StagehandConfig from "@/evals/deterministic/stagehand.config"; test.describe("StagehandPage - waitFor", () => { test("should wait for an element to become visible", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://docs.browserbase.com/introduction"); const dynamicElement = page.locator( "div.grid:nth-child(1) > a:nth-child(1) > div:nth-child(1)", ); const isVisibleBefore = await dynamicElement.isVisible(); expect(isVisibleBefore).toBe(false); const clickableElement = page.locator( "div.not-prose:nth-child(2) > a:nth-child(1) > div:nth-child(1)", ); await clickableElement.click(); await dynamicElement.waitFor({ state: "visible" }); const isVisibleAfter = await dynamicElement.isVisible(); expect(isVisibleAfter).toBe(true); await stagehand.close(); }); test("should wait for an element to be detached", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://docs.browserbase.com/introduction"); const disappearingElement = page.locator( "div.not-prose:nth-child(2) > a:nth-child(1) > div:nth-child(1)", ); await disappearingElement.click(); await disappearingElement.waitFor({ state: "detached" }); const isAttachedAfter = await disappearingElement.isVisible(); expect(isAttachedAfter).toBe(false); await stagehand.close(); }); test("should wait for a specific event (waitForEvent)", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://docs.browserbase.com/introduction"); const consolePromise = page.waitForEvent("console"); await page.evaluate(() => { console.log("Hello from the browser console!"); }); const 
consoleMessage = await consolePromise; expect(consoleMessage.text()).toBe("Hello from the browser console!"); await stagehand.close(); }); test("should wait for a function to return true (waitForFunction)", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://docs.browserbase.com/introduction"); await page.evaluate(() => { setTimeout(() => { const w = window as typeof window & { __stagehandFlag?: boolean; }; w.__stagehandFlag = true; }, 1000); }); await page.waitForFunction(() => { const w = window as typeof window & { __stagehandFlag?: boolean; }; return w.__stagehandFlag === true; }); const value = await page.evaluate(() => { const w = window as typeof window & { __stagehandFlag?: boolean; }; return w.__stagehandFlag; }); expect(value).toBe(true); await stagehand.close(); }); test("should wait for the load state (waitForLoadState)", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://docs.browserbase.com/introduction"); await page.waitForLoadState("networkidle"); const heroTitle = page.locator("h1"); await expect(heroTitle).toHaveText(/Documentation/i); await stagehand.close(); }); test("should wait for a specific request (waitForRequest)", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; const requestPromise = page.waitForRequest((req) => req.url().includes("mintlify"), ); await page.goto("https://docs.browserbase.com/introduction"); const matchingRequest = await requestPromise; expect(matchingRequest.url()).toContain("mintlify"); await stagehand.close(); }); test("should wait for a specific response (waitForResponse)", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; const responsePromise = page.waitForResponse( (res) => 
res.url().includes("introduction") && res.status() === 200, ); await page.goto("https://docs.browserbase.com/introduction"); const matchingResponse = await responsePromise; expect(await matchingResponse.text()).toContain("Browserbase"); await stagehand.close(); }); test("should wait for a URL (waitForURL)", async () => { const stagehand = new Stagehand(StagehandConfig); await stagehand.init(); const page = stagehand.page; await page.goto("https://docs.browserbase.com"); const getStartedLink = page.locator( "div.not-prose:nth-child(3) > a:nth-child(1) > div:nth-child(1)", ); await getStartedLink.click(); await page.waitForURL(/.*getting-started.*/); expect(page.url()).toContain("/getting-started"); await stagehand.close(); }); }); ``` ## /evals/env.ts ```ts path="/evals/env.ts" /** * Determine the current environment in which the evaluations are running: * - BROWSERBASE or LOCAL * * The environment is read from the EVAL_ENV environment variable. */ export const env: "BROWSERBASE" | "LOCAL" = process.env.EVAL_ENV?.toLowerCase() === "browserbase" ? "BROWSERBASE" : "LOCAL"; /** * Enable or disable caching based on the EVAL_ENABLE_CACHING environment variable. * Caching may improve performance by not re-fetching or re-computing certain results. * By default, caching is disabled unless explicitly enabled. 
*/ export const enableCaching = process.env.EVAL_ENABLE_CACHING?.toLowerCase() === "true"; ``` ## /evals/evals.config.json ```json path="/evals/evals.config.json" { "tasks": [ { "name": "history", "categories": ["combination"] }, { "name": "expect_act_timeout", "categories": ["regression"] }, { "name": "extract_repo_name", "categories": ["extract"] }, { "name": "amazon_add_to_cart", "categories": ["act"] }, { "name": "instructions", "categories": ["regression", "combination"] }, { "name": "bidnet", "categories": ["act"] }, { "name": "ionwave", "categories": ["act", "regression"], "extract_method": "domExtract" }, { "name": "nonsense_action", "categories": ["act"] }, { "name": "peeler_simple", "categories": ["act"] }, { "name": "simple_google_search", "categories": ["act"] }, { "name": "vantechjournal", "categories": ["act"] }, { "name": "wikipedia", "categories": ["act"] }, { "name": "allrecipes", "categories": ["combination"] }, { "name": "arxiv", "categories": ["combination"] }, { "name": "extract_collaborators", "categories": ["combination"] }, { "name": "extract_github_commits", "categories": ["combination"] }, { "name": "imdb_movie_details", "categories": ["combination"] }, { "name": "peeler_complex", "categories": ["combination"] }, { "name": "sciquest", "categories": ["combination"] }, { "name": "wichita", "categories": ["combination", "regression"], "extract_method": "domExtract" }, { "name": "hn_aisdk", "categories": ["llm_clients"] }, { "name": "hn_langchain", "categories": ["llm_clients"] }, { "name": "hn_customOpenAI", "categories": ["llm_clients"] }, { "name": "apple", "categories": ["experimental"] }, { "name": "combination_sauce", "categories": ["experimental"] }, { "name": "costar", "categories": ["experimental"] }, { "name": "expedia", "categories": ["experimental"] }, { "name": "expedia_search", "categories": ["experimental"] }, { "name": "extract_aigrant_companies", "categories": ["text_extract", "regression"], "extract_method": "textExtract" }, 
{ "name": "extract_capacitor_info", "categories": ["experimental", "text_extract"] }, { "name": "extract_partners", "categories": ["experimental"] }, { "name": "extract_press_releases", "categories": ["experimental", "text_extract"] }, { "name": "extract_snowshoeing_destinations", "categories": ["experimental", "text_extract"] }, { "name": "google_jobs", "categories": ["experimental"] }, { "name": "homedepot", "categories": ["experimental"] }, { "name": "rakuten_jp", "categories": ["experimental"] }, { "name": "stock_x", "categories": ["experimental"] }, { "name": "ted_talk", "categories": ["experimental"] }, { "name": "extract_baptist_health", "categories": ["extract"] }, { "name": "extract_github_stars", "categories": ["extract"] }, { "name": "extract_memorial_healthcare", "categories": ["extract", "regression"], "extract_method": "domExtract" }, { "name": "extract_nhl_stats", "categories": ["extract"] }, { "name": "extract_professional_info", "categories": ["extract"] }, { "name": "extract_csa", "categories": ["text_extract"] }, { "name": "extract_resistor_info", "categories": ["extract"] }, { "name": "extract_rockauto", "categories": ["extract"] }, { "name": "extract_staff_members", "categories": ["extract"] }, { "name": "ionwave_observe", "categories": ["observe"] }, { "name": "panamcs", "categories": ["observe"] }, { "name": "vanta_h", "categories": ["experimental"] }, { "name": "extract_area_codes", "categories": ["text_extract"] }, { "name": "extract_public_notices", "categories": ["text_extract"] }, { "name": "extract_jstor_news", "categories": ["text_extract"] }, { "name": "extract_apartments", "categories": ["text_extract"] }, { "name": "extract_zillow", "categories": ["text_extract"] }, { "name": "observe_github", "categories": ["observe", "regression"], "extract_method": "textExtract" }, { "name": "observe_vantechjournal", "categories": ["observe", "regression"], "extract_method": "textExtract" }, { "name": "observe_amazon_add_to_cart", "categories": 
["observe"] }, { "name": "observe_simple_google_search", "categories": ["observe"] }, { "name": "observe_yc_startup", "categories": ["observe"] }, { "name": "observe_taxes", "categories": ["observe"] }, { "name": "observe_iframes1", "categories": ["regression", "observe"] }, { "name": "observe_iframes2", "categories": ["regression", "observe"] }, { "name": "extract_hamilton_weather", "categories": ["targeted_extract", "regression"], "extract_method": "textExtract" }, { "name": "extract_regulations_table", "categories": ["targeted_extract"] }, { "name": "extract_recipe", "categories": ["targeted_extract"] }, { "name": "extract_aigrant_targeted", "categories": ["targeted_extract"] }, { "name": "extract_aigrant_targeted_2", "categories": ["targeted_extract"] }, { "name": "extract_geniusee", "categories": ["targeted_extract"] }, { "name": "extract_geniusee_2", "categories": ["targeted_extract"] }, { "name": "scroll_50", "categories": ["regression", "act"] }, { "name": "scroll_75", "categories": ["regression", "act"] }, { "name": "nextChunk", "categories": ["regression", "act"] }, { "name": "prevChunk", "categories": ["regression", "act"] }, { "name": "google_flights", "categories": ["act"] }, { "name": "extract_jfk_links", "categories": ["extract"] }, { "name": "extract_single_link", "categories": ["extract"] }, { "name": "dropdown", "categories": ["act"] }, { "name": "radio_btn", "categories": ["act"] }, { "name": "checkboxes", "categories": ["act"] }, { "name": "agent/iframe_form", "categories": ["agent"] }, { "name": "agent/iframe_form_multiple", "categories": ["agent"] }, { "name": "agent/google_flights", "categories": ["agent"] }, { "name": "agent/sf_library_card", "categories": ["agent"] }, { "name": "agent/sf_library_card_multiple", "categories": ["agent"] } ] } ``` ## /evals/evaluator.ts ```ts path="/evals/evaluator.ts" /** * This class is responsible for evaluating the result of an agentic task. 
* The first version includes a VLM evaluator specifically prompted to evaluate the state of a task * usually represented as a screenshot. * The evaluator will reply with YES or NO given the state of the provided task. */ import { AvailableModel, ClientOptions, Stagehand } from "@/dist"; import { LLMResponseError } from "@/types/stagehandErrors"; import dotenv from "dotenv"; import { EvaluateOptions, EvaluationResult, BatchEvaluateOptions, } from "@/types/evaluator"; dotenv.config(); export class Evaluator { private stagehand: Stagehand; private modelName: AvailableModel; private modelClientOptions: ClientOptions | { apiKey: string }; // Define regex patterns directly in the class or as constants if preferred elsewhere private yesPattern = /^(YES|Y|TRUE|CORRECT|AFFIRMATIVE)/i; private noPattern = /^(NO|N|FALSE|INCORRECT|NEGATIVE)/i; constructor( stagehand: Stagehand, modelName?: AvailableModel, modelClientOptions?: ClientOptions, ) { this.stagehand = stagehand; this.modelName = modelName || "gemini-2.0-flash"; this.modelClientOptions = modelClientOptions || { apiKey: process.env.GOOGLE_API_KEY || "", }; } /** * Evaluates the current state of the page against a specific question. * Expects a JSON object response: { "evaluation": "YES" | "NO", "reasoning": "..." } * Returns the evaluation result with normalized response and success status. * * @param options - The options for evaluation * @returns A promise that resolves to an EvaluationResult * @throws Error if strictResponse is true and response is not clearly YES or NO, or if JSON parsing/validation fails. */ async evaluate(options: EvaluateOptions): Promise { const { question, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer. 
Return your response as a JSON object with the following format: { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`, screenshotDelayMs = 1000, strictResponse = false, } = options; await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs)); const imageBuffer = await this.stagehand.page.screenshot(); const llmClient = this.stagehand.llmProvider.getClient( this.modelName, this.modelClientOptions, ); const response = await llmClient.createChatCompletion({ logger: this.stagehand.logger, options: { messages: [ { role: "system", content: systemPrompt }, { role: "user", content: question }, ], image: { buffer: imageBuffer }, }, }); const rawResponse = response.choices[0].message.content; let evaluationResult: "YES" | "NO" | "INVALID" = "INVALID"; let reasoning = `Failed to process response. Raw response: ${rawResponse}`; try { // Clean potential markdown fences const cleanedResponse = rawResponse .replace(/^\`\`\`json\s*/, "") .replace(/\s*\`\`\`$/, "") .trim(); // Attempt to parse the JSON object const parsedResult: { evaluation: unknown; reasoning: unknown } = JSON.parse(cleanedResponse); // Validate structure if ( typeof parsedResult !== "object" || parsedResult === null || typeof parsedResult.evaluation !== "string" || typeof parsedResult.reasoning !== "string" ) { throw new LLMResponseError( "Evaluator", `Invalid JSON structure received: ${JSON.stringify(parsedResult)}`, ); } const evaluationString = parsedResult.evaluation.trim().toUpperCase(); reasoning = parsedResult.reasoning.trim(); // Update reasoning from parsed object // Use regex patterns to validate the evaluation string const isYes = this.yesPattern.test(evaluationString); const isNo = this.noPattern.test(evaluationString); if (isYes) { evaluationResult = "YES"; } else if (isNo) { evaluationResult = "NO"; } else { // Parsed JSON but evaluation value wasn't YES/NO variant if (strictResponse) { throw new LLMResponseError( "Evaluator", `Invalid evaluation value in 
JSON: ${parsedResult.evaluation}`, ); } // Keep INVALID, reasoning already updated reasoning = `Invalid evaluation value: ${parsedResult.evaluation}. Reasoning: ${reasoning}`; } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); // Update reasoning with error details reasoning = `Processing error: ${errorMessage}. Raw response: ${rawResponse}`; if (strictResponse) { // Re-throw error if in strict mode throw new LLMResponseError("Evaluator", reasoning); } // Keep evaluationResult as "INVALID" } return { evaluation: evaluationResult, reasoning: reasoning, }; } /** * Evaluates the current state of the page against multiple questions in a single screenshot. * Returns an array of evaluation results. * * @param options - The options for batch evaluation * @returns A promise that resolves to an array of EvaluationResults * @throws Error if strictResponse is true and any response is not clearly YES or NO */ async batchEvaluate( options: BatchEvaluateOptions, ): Promise { const { questions, systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer. Return your response as a JSON array, where each object corresponds to a question and has the following format: { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`, screenshotDelayMs = 1000, strictResponse = false, } = options; // Wait for the specified delay before taking screenshot await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs)); // Take a screenshot of the current page state const imageBuffer = await this.stagehand.page.screenshot(); // Create a numbered list of questions for the VLM const formattedQuestions = questions .map((q, i) => `${i + 1}. 
${q}`) .join("\n"); // Get the LLM client with our preferred model const llmClient = this.stagehand.llmProvider.getClient( this.modelName, this.modelClientOptions, ); // Use the model-specific LLM client to evaluate the screenshot with all questions const response = await llmClient.createChatCompletion({ logger: this.stagehand.logger, options: { messages: [ { role: "system", content: `${systemPrompt}\n\nYou will be given multiple questions. Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`, }, { role: "user", content: formattedQuestions, }, ], image: { buffer: imageBuffer, }, }, }); const rawResponse = response.choices[0].message.content; let finalResults: EvaluationResult[] = []; try { // Clean potential markdown fences const cleanedResponse = rawResponse .replace(/^\`\`\`json\s*/, "") .replace(/\s*\`\`\`$/, "") .trim(); // Attempt to parse the JSON array const parsedResults: { evaluation: unknown; reasoning: unknown }[] = JSON.parse(cleanedResponse); if (!Array.isArray(parsedResults)) { throw new LLMResponseError( "Evaluator", "Response is not a JSON array.", ); } if (parsedResults.length !== questions.length && strictResponse) { throw new LLMResponseError( "Evaluator", `Expected ${questions.length} results, but got ${parsedResults.length}`, ); } for (let i = 0; i < questions.length; i++) { if (i < parsedResults.length) { const item = parsedResults[i]; // Ensure item is an object and has the required properties if ( typeof item !== "object" || item === null || typeof item.evaluation !== "string" || typeof item.reasoning !== "string" ) { if (strictResponse) { throw new LLMResponseError( "Evaluator", `Invalid object structure for question ${i + 1}: ${JSON.stringify(item)}`, ); } finalResults.push({ evaluation: "INVALID", reasoning: `Invalid object structure received: ${JSON.stringify( item, )}`, }); continue; // Move to the next question } // 
Use regex patterns for validation const evaluationString = item.evaluation.trim().toUpperCase(); const reasoning = item.reasoning.trim(); const isYes = this.yesPattern.test(evaluationString); const isNo = this.noPattern.test(evaluationString); if (isYes) { finalResults.push({ evaluation: "YES", reasoning: reasoning }); } else if (isNo) { finalResults.push({ evaluation: "NO", reasoning: reasoning }); } else { // Invalid evaluation value if (strictResponse) { throw new LLMResponseError( "Evaluator", `Invalid evaluation value for question ${i + 1}: ${item.evaluation}`, ); } finalResults.push({ evaluation: "INVALID", reasoning: `Invalid evaluation value: ${item.evaluation}. Reasoning: ${reasoning}`, }); } } else { // Missing result for this question if (strictResponse) { throw new LLMResponseError( "Evaluator", `No response found for question ${i + 1}`, ); } finalResults.push({ evaluation: "INVALID", reasoning: "No response found for this question.", }); } } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); // If JSON parsing fails or structure is wrong, handle based on strictResponse if (strictResponse) { throw new LLMResponseError( "Evaluator", `Failed to parse LLM response or invalid format: ${rawResponse}. Error: ${errorMessage}`, ); } // Fallback: return INVALID for all questions finalResults = []; // Clear any potentially partially filled results for (let i = 0; i < questions.length; i++) { finalResults.push({ evaluation: "INVALID", reasoning: `Failed to parse response. Raw response: ${rawResponse}. Error: ${errorMessage}`, }); } } return finalResults; } } ``` ## /evals/index.eval.ts ```ts path="/evals/index.eval.ts" /** * This script orchestrates the running of evaluations against a set of tasks. * It uses Braintrust to run multiple testcases (each testcase representing a * given task-model combination) and then aggregates the results, producing * a summary of passes, failures, and categorized success rates. 
 *
 * Overview:
 * - Reads a configuration file `evals.config.json` to determine what tasks (evaluations)
 *   are available and which categories they belong to.
 * - Supports filtering which tasks to run either by evaluation category or by specific task name.
 * - Supports multiple models, defaulting to certain sets of models depending on the category.
 * - Runs each selected task against each selected model in parallel, collecting results.
 * - Saves a summary of the evaluation results to `eval-summary.json`.
 */
import fs from "fs";
import path from "path";
import process from "process";
import {
  DEFAULT_EVAL_CATEGORIES,
  filterByCategory,
  filterByEvalName,
  useTextExtract,
} from "./args";
import { generateExperimentName } from "./utils";
import { exactMatch, errorMatch } from "./scoring";
import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
import { SummaryResult, Testcase } from "@/types/evals";
import { EvalLogger } from "./logger";
import { AvailableModel, LLMClient } from "@/dist";
import { env } from "./env";
import dotenv from "dotenv";
import { StagehandEvalError } from "@/types/stagehandErrors";
import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
import OpenAI from "openai";
import { initStagehand } from "./initStagehand";
import { AISdkClient } from "@/examples/external_clients/aisdk";
import { google } from "@ai-sdk/google";
import { anthropic } from "@ai-sdk/anthropic";
import { groq } from "@ai-sdk/groq";
import { cerebras } from "@ai-sdk/cerebras";
import { openai } from "@ai-sdk/openai";

dotenv.config();

/**
 * Read max concurrency and trial count from environment variables set in args.ts.
 * Fallback to defaults (3 and 3) if they're not provided.
 */
const MAX_CONCURRENCY = process.env.EVAL_MAX_CONCURRENCY
  ? parseInt(process.env.EVAL_MAX_CONCURRENCY, 10)
  : 3;

const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT ?
parseInt(process.env.EVAL_TRIAL_COUNT, 10) : 3; /** * generateSummary: * After all evaluations have finished, aggregate the results into a summary. * This summary includes: * - Which tasks passed or failed (with model and categories). * - Category-wise success percentages. * - Model-wise success percentages. * * The summary is written to `eval-summary.json` for further analysis. */ const generateSummary = async ( results: SummaryResult[], experimentName: string, ) => { // Determine passed testcases (those with _success: true) const passed = results .filter((r) => r.output._success) .map((r) => ({ eval: r.input.name, model: r.input.modelName, categories: tasksByName[r.input.name].categories, })); // Determine failed testcases (those with _success: false) const failed = results .filter((r) => !r.output._success) .map((r) => ({ eval: r.input.name, model: r.input.modelName, categories: tasksByName[r.input.name].categories, })); // Calculate success counts for each category const categorySuccessCounts: Record< string, { total: number; success: number } > = {}; for (const taskName of Object.keys(tasksByName)) { const taskCategories = tasksByName[taskName].categories; const taskResults = results.filter((r) => r.input.name === taskName); const successCount = taskResults.filter((r) => r.output._success).length; for (const cat of taskCategories) { if (!categorySuccessCounts[cat]) { categorySuccessCounts[cat] = { total: 0, success: 0 }; } categorySuccessCounts[cat].total += taskResults.length; categorySuccessCounts[cat].success += successCount; } } // Compute percentage success per category const categories: Record = {}; for (const [cat, counts] of Object.entries(categorySuccessCounts)) { categories[cat] = Math.round((counts.success / counts.total) * 100); } // Compute percentage success per model const models: Record = {}; const allModels = [...new Set(results.map((r) => r.input.modelName))]; for (const model of allModels) { const modelResults = results.filter((r) => 
r.input.modelName === model); const successCount = modelResults.filter((r) => r.output._success).length; models[model] = Math.round((successCount / modelResults.length) * 100); } // Format and write the summary to a JSON file const formattedSummary = { experimentName, passed, failed, categories, models, }; fs.writeFileSync( "eval-summary.json", JSON.stringify(formattedSummary, null, 2), ); console.log("Evaluation summary written to eval-summary.json"); }; /** * generateFilteredTestcases: * Based on the chosen filters (category or specific eval name) and environment, * this function generates the set of testcases to run. Each testcase is a combination * of a task and a model. * * Steps: * - Dynamically determine the list of models based on filters. * - Start with all combinations of tasks (from `tasksByName`) and the determined models. * - Filter by category if a category filter was specified. * - Filter by evaluation name if specified. * - In the BROWSERBASE environment, exclude certain tasks that are not suitable. 
*/ const generateFilteredTestcases = (): Testcase[] => { let taskNamesToRun: string[]; let effectiveCategory: string | null = filterByCategory; // Start with the command-line filter if (filterByEvalName) { // If a specific task name is given, that's the only one we run taskNamesToRun = [filterByEvalName]; // Check if this single task belongs *only* to the agent category to override models const taskCategories = tasksByName[filterByEvalName]?.categories || []; if (taskCategories.length === 1 && taskCategories[0] === "agent") { // Treat this run as an 'agent' category run for model selection effectiveCategory = "agent"; console.log( `Task ${filterByEvalName} is agent-specific, using agent models.`, ); } } else if (filterByCategory) { // If filtering by category, get all tasks in that category taskNamesToRun = Object.keys(tasksByName).filter((name) => tasksByName[name].categories.includes(filterByCategory!), ); } else { // If no specific task or category filter, run tasks from default categories taskNamesToRun = Object.keys(tasksByName).filter((name) => DEFAULT_EVAL_CATEGORIES.some((category) => tasksByName[name].categories.includes(category), ), ); } // Dynamically determine the MODELS based on the effective category const currentModels = getModelList(effectiveCategory); console.log( `Using models for this run (${effectiveCategory || "default"}):`, currentModels, ); // Create a list of all testcases using the determined task names and models let allTestcases = currentModels.flatMap((model) => taskNamesToRun.map((testName) => ({ input: { name: testName, modelName: model as AvailableModel }, name: testName, tags: [ model, testName, ...(tasksConfig.find((t) => t.name === testName)?.categories || []).map( (x) => `category/${x}`, ), ], metadata: { model: model as AvailableModel, test: testName, categories: tasksConfig.find((t) => t.name === testName)?.categories, }, expected: true, })), ); // This filtering step might now be redundant if taskNamesToRun is already filtered 
if (filterByCategory) { allTestcases = allTestcases.filter((testcase) => tasksByName[testcase.name].categories.includes(filterByCategory!), ); } // If running in BROWSERBASE environment, exclude tasks that are not applicable. if (env === "BROWSERBASE") { allTestcases = allTestcases.filter( (testcase) => !["peeler_simple", "stock_x"].includes(testcase.name), ); } console.log( "Final test cases to run:", allTestcases .map( (t, i) => `${i}: ${t.name} (${t.input.modelName}): ${t.metadata.categories}`, ) .join("\n"), ); return allTestcases; }; /** * Main execution block: * - Determine experiment name * - Determine the project name (braintrustProjectName) based on CI or dev environment * - Run the Eval function with the given configuration: * * experimentName: A label for this run * * data: A function that returns the testcases to run * * task: A function that executes each task, given input specifying model and task name * * scores: An array of scoring functions * * maxConcurrency: Limit on parallel tasks * * trialCount: Number of trials (retries) per task * - Collect and summarize results using `generateSummary`. */ (async () => { // Generate a unique name for the experiment const experimentName: string = generateExperimentName({ evalName: filterByEvalName || undefined, category: filterByCategory || undefined, environment: env, }); // Determine braintrust project name to use (stagehand in CI, stagehand-dev otherwise) const braintrustProjectName = process.env.CI === "true" ? 
"stagehand" : "stagehand-dev"; try { // Run the evaluations with the braintrust Eval function const evalResult = await Eval(braintrustProjectName, { experimentName, data: generateFilteredTestcases, // Each test is a function that runs the corresponding task module task: async (input: { name: string; modelName: AvailableModel }) => { const logger = new EvalLogger(); try { // Dynamically import the task based on its name const taskModulePath = path.join( __dirname, "tasks", `${input.name}.ts`, ); // Check if file exists at direct path let taskModule; try { // First try to import directly (for backward compatibility) taskModule = await import(taskModulePath); } catch (error) { if (input.name.includes("/")) { // If the name includes a path separator, try to import from subdirectory const subDirPath = path.join( __dirname, "tasks", `${input.name}.ts`, ); try { taskModule = await import(subDirPath); } catch (subError) { throw new StagehandEvalError( `Failed to import task module for ${input.name}. Tried paths:\n` + `- ${taskModulePath}\n` + `- ${subDirPath}\n` + `Error: ${subError.message}`, ); } } else { throw new StagehandEvalError( `Failed to import task module for ${input.name} at path ${taskModulePath}: ${error.message}`, ); } } // Extract the task function const taskName = input.name.includes("/") ? 
input.name.split("/").pop() // Get the last part of the path for nested tasks : input.name; const taskFunction = taskModule[taskName]; if (typeof taskFunction !== "function") { throw new StagehandEvalError( `No Eval function found for task name: ${taskName} in module ${input.name}`, ); } let shouldUseTextExtract = useTextExtract; const categories = tasksByName[input.name].categories || []; const isRegression = categories.includes("regression"); const regressionExtractMethod = tasksByName[input.name].extractMethod; if (isRegression) { if (regressionExtractMethod) { shouldUseTextExtract = regressionExtractMethod === "textExtract"; } } // Execute the task let llmClient: LLMClient; if ( input.modelName.startsWith("gpt") || input.modelName.startsWith("o") ) { llmClient = new AISdkClient({ model: wrapAISDKModel(openai(input.modelName)), }); } else if (input.modelName.startsWith("gemini")) { llmClient = new AISdkClient({ model: wrapAISDKModel(google(input.modelName)), }); } else if (input.modelName.startsWith("claude")) { llmClient = new AISdkClient({ model: wrapAISDKModel(anthropic(input.modelName)), }); } else if (input.modelName.includes("groq")) { llmClient = new AISdkClient({ model: wrapAISDKModel( groq( input.modelName.substring(input.modelName.indexOf("/") + 1), ), ), }); } else if (input.modelName.includes("cerebras")) { llmClient = new AISdkClient({ model: wrapAISDKModel( cerebras( input.modelName.substring(input.modelName.indexOf("/") + 1), ), ), }); } else if (input.modelName.includes("/")) { llmClient = new CustomOpenAIClient({ modelName: input.modelName as AvailableModel, client: wrapOpenAI( new OpenAI({ apiKey: process.env.TOGETHER_AI_API_KEY, baseURL: "https://api.together.xyz/v1", }), ), }); } const taskInput = await initStagehand({ logger, llmClient, useTextExtract: shouldUseTextExtract, modelName: input.modelName, }); let result; try { result = await taskFunction(taskInput); // Log result to console if (result && result._success) { console.log(`✅ 
${input.name}: Passed`); } else { console.log(`❌ ${input.name}: Failed`); } } finally { await taskInput.stagehand.close(); } return result; } catch (error) { // Log any errors that occur during task execution console.error(`❌ ${input.name}: Error - ${error}`); logger.error({ message: `Error in task ${input.name}`, level: 0, auxiliary: { error: { value: error.message, type: "string", }, trace: { value: error.stack, type: "string", }, }, }); return { _success: false, error: JSON.parse(JSON.stringify(error, null, 2)), logs: logger.getLogs(), }; } }, // Use the scoring functions defined above scores: [exactMatch, errorMatch], maxConcurrency: MAX_CONCURRENCY, trialCount: TRIAL_COUNT, }); // Map results to the SummaryResult format const summaryResults: SummaryResult[] = evalResult.results.map((result) => { const output = typeof result.output === "boolean" ? { _success: result.output } : result.output; return { input: result.input, output, name: result.input.name, score: output._success ? 1 : 0, }; }); // Generate and write the summary await generateSummary(summaryResults, experimentName); } catch (error) { console.error("Error during evaluation run:", error); process.exit(1); } })(); ``` ## /evals/initStagehand.ts ```ts path="/evals/initStagehand.ts" /** * This file provides a function to initialize a Stagehand instance for use in evaluations. * It configures the Stagehand environment and sets default options based on the current environment * (e.g., local or BROWSERBASE), caching preferences, and verbosity. It also establishes a logger for * capturing logs emitted by Stagehand. * * We create a central config object (`StagehandConfig`) that defines all parameters for Stagehand. * * The `initStagehand` function takes the model name, an optional DOM settling timeout, and an EvalLogger, * then uses these to override some default values before creating and initializing the Stagehand instance. 
*/ import { enableCaching, env } from "./env"; import { ConstructorParams, LLMClient, Stagehand } from "@/dist"; import { EvalLogger } from "./logger"; import type { StagehandInitResult } from "@/types/evals"; import { AvailableModel } from "@/dist"; /** * StagehandConfig: * This configuration object follows a similar pattern to `examples/stagehand.config.ts`. * It sets the environment, verbosity, caching preferences, and other defaults. Some values, * like `apiKey` and `projectId`, can be defined via environment variables if needed. * * Adjust or remove fields as appropriate for your environment. */ const StagehandConfig = { env: env, apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, verbose: 2 as const, debugDom: true, headless: false, enableCaching, domSettleTimeoutMs: 30_000, disablePino: true, browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID!, browserSettings: { viewport: { width: 1024, height: 768, }, }, }, }; /** * Initializes a Stagehand instance for a given model: * - modelName: The model to use (overrides default in StagehandConfig) * - domSettleTimeoutMs: Optional timeout for DOM settling operations * - logger: An EvalLogger instance for capturing logs * * Returns: * - stagehand: The initialized Stagehand instance * - logger: The provided logger, associated with the Stagehand instance * - initResponse: Any response data returned by Stagehand initialization */ export const initStagehand = async ({ llmClient, domSettleTimeoutMs, logger, configOverrides, actTimeoutMs, useTextExtract, modelName, }: { llmClient: LLMClient; domSettleTimeoutMs?: number; logger: EvalLogger; configOverrides?: Partial; actTimeoutMs?: number; useTextExtract?: boolean; modelName: AvailableModel; }): Promise => { const config = { ...StagehandConfig, llmClient, ...(domSettleTimeoutMs && { domSettleTimeoutMs }), actTimeoutMs, ...configOverrides, logger: logger.log.bind(logger), }; const stagehand = new 
Stagehand(config); // Associate the logger with the Stagehand instance logger.init(stagehand); const { debugUrl, sessionUrl } = await stagehand.init(); return { stagehand, stagehandConfig: config, logger, debugUrl, sessionUrl, useTextExtract, modelName, }; }; ``` ## /evals/llm_clients/hn_aisdk.ts ```ts path="/evals/llm_clients/hn_aisdk.ts" import { Stagehand } from "@/dist"; import { AISdkClient } from "@/examples/external_clients/aisdk"; import { EvalFunction } from "@/types/evals"; import { openai } from "@ai-sdk/openai/dist"; import { z } from "zod"; export const hn_aisdk: EvalFunction = async ({ debugUrl, sessionUrl, stagehandConfig, logger, }) => { const stagehand = new Stagehand({ ...stagehandConfig, llmClient: new AISdkClient({ model: openai("gpt-4o-mini"), }), }); await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/hackernews/", ); let { story } = await stagehand.page.extract({ instruction: "extract the title of the top story on the page", schema: z.object({ story: z.string().describe("the title of the top story on the page"), }), }); // remove the (url) part of the story title story = story.split(" (")[0]; const expectedStoryElement = await stagehand.page.$( "xpath=/html/body/center/table/tbody/tr[3]/td/table/tbody/tr[1]/td[3]/span/a", ); // remove the (url) part of the story title const expectedStory = (await expectedStoryElement?.textContent())?.split( " (", )?.[0]; if (!expectedStory) { logger.error({ message: "Could not find expected story element", level: 0, }); return { _success: false, error: "Could not find expected story element", debugUrl, sessionUrl, logs: logger.getLogs(), }; } if (story !== expectedStory) { logger.error({ message: "Extracted story does not match expected story", level: 0, auxiliary: { expected: { value: expectedStory, type: "string", }, actual: { value: story, type: "string", }, }, }); return { _success: false, error: "Extracted story does not match expected story", 
expectedStory, actualStory: story, debugUrl, sessionUrl, logs: logger.getLogs(), }; } await stagehand.page.act("Click on the 'new' tab"); if (stagehand.page.url() !== "https://news.ycombinator.com/newest") { logger.error({ message: "Page did not navigate to the 'new' tab", level: 0, auxiliary: { expected: { value: "https://news.ycombinator.com/newest", type: "string", }, actual: { value: stagehand.page.url(), type: "string", }, }, }); return { _success: false, error: "Page did not navigate to the 'new' tab", expectedUrl: "https://news.ycombinator.com/newest", actualUrl: stagehand.page.url(), debugUrl, sessionUrl, logs: logger.getLogs(), }; } await stagehand.close(); return { _success: true, expectedStory, actualStory: story, debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` ## /evals/llm_clients/hn_customOpenAI.ts ```ts path="/evals/llm_clients/hn_customOpenAI.ts" import { EvalFunction } from "@/types/evals"; import { z } from "zod"; import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI"; import OpenAI from "openai"; import { Stagehand } from "@/dist"; export const hn_customOpenAI: EvalFunction = async ({ logger, stagehandConfig, debugUrl, sessionUrl, }) => { const stagehand = new Stagehand({ ...stagehandConfig, llmClient: new CustomOpenAIClient({ modelName: "gpt-4o-mini", client: new OpenAI({ apiKey: process.env.OPENAI_API_KEY, }), }), }); await stagehand.init(); await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/hackernews/", ); let { story } = await stagehand.page.extract({ instruction: "extract the title of the top story on the page", schema: z.object({ story: z.string().describe("the title of the top story on the page"), }), }); // remove the (url) part of the story title story = story.split(" (")[0]; const expectedStoryElement = await stagehand.page.$( "xpath=/html/body/center/table/tbody/tr[3]/td/table/tbody/tr[1]/td[3]/span/a", ); // remove the (url) part of the story title const expectedStory = 
(await expectedStoryElement?.textContent())?.split( " (", )?.[0]; if (!expectedStory) { logger.error({ message: "Could not find expected story element", level: 0, }); return { _success: false, error: "Could not find expected story element", debugUrl, sessionUrl, logs: logger.getLogs(), }; } if (story !== expectedStory) { logger.error({ message: "Extracted story does not match expected story", level: 0, auxiliary: { expected: { value: expectedStory, type: "string", }, actual: { value: story, type: "string", }, }, }); return { _success: false, error: "Extracted story does not match expected story", expectedStory, actualStory: story, debugUrl, sessionUrl, logs: logger.getLogs(), }; } await stagehand.page.act("Click on the 'new' tab"); if (stagehand.page.url() !== "https://news.ycombinator.com/newest") { logger.error({ message: "Page did not navigate to the 'new' tab", level: 0, auxiliary: { expected: { value: "https://news.ycombinator.com/newest", type: "string", }, actual: { value: stagehand.page.url(), type: "string", }, }, }); return { _success: false, error: "Page did not navigate to the 'new' tab", expectedUrl: "https://news.ycombinator.com/newest", actualUrl: stagehand.page.url(), debugUrl, sessionUrl, logs: logger.getLogs(), }; } await stagehand.close(); return { _success: true, expectedStory, actualStory: story, debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` ## /evals/llm_clients/hn_langchain.ts ```ts path="/evals/llm_clients/hn_langchain.ts" import { EvalFunction } from "@/types/evals"; import { z } from "zod"; import { LangchainClient } from "@/examples/external_clients/langchain"; import { ChatOpenAI } from "@langchain/openai"; import { Stagehand } from "@/dist"; export const hn_langchain: EvalFunction = async ({ logger, stagehandConfig, debugUrl, sessionUrl, }) => { const stagehand = new Stagehand({ ...stagehandConfig, llmClient: new LangchainClient( new ChatOpenAI({ model: "gpt-4o-mini", }), ), }); await stagehand.init(); await 
stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/hackernews/", ); let { story } = await stagehand.page.extract({ instruction: "extract the title of the top story on the page", schema: z.object({ story: z.string().describe("the title of the top story on the page"), }), }); // remove the (url) part of the story title story = story.split(" (")[0]; const expectedStoryElement = await stagehand.page.$( "xpath=/html/body/center/table/tbody/tr[3]/td/table/tbody/tr[1]/td[3]/span/a", ); // remove the (url) part of the story title const expectedStory = (await expectedStoryElement?.textContent())?.split( " (", )?.[0]; if (!expectedStory) { logger.error({ message: "Could not find expected story element", level: 0, }); return { _success: false, error: "Could not find expected story element", debugUrl, sessionUrl, logs: logger.getLogs(), }; } if (story !== expectedStory) { logger.error({ message: "Extracted story does not match expected story", level: 0, auxiliary: { expected: { value: expectedStory, type: "string", }, actual: { value: story, type: "string", }, }, }); return { _success: false, error: "Extracted story does not match expected story", expectedStory, actualStory: story, debugUrl, sessionUrl, logs: logger.getLogs(), }; } await stagehand.page.act("Click on the 'new' tab"); if (stagehand.page.url() !== "https://news.ycombinator.com/newest") { logger.error({ message: "Page did not navigate to the 'new' tab", level: 0, auxiliary: { expected: { value: "https://news.ycombinator.com/newest", type: "string", }, actual: { value: stagehand.page.url(), type: "string", }, }, }); return { _success: false, error: "Page did not navigate to the 'new' tab", expectedUrl: "https://news.ycombinator.com/newest", actualUrl: stagehand.page.url(), debugUrl, sessionUrl, logs: logger.getLogs(), }; } await stagehand.close(); return { _success: true, expectedStory, actualStory: story, debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` ## /evals/logger.ts ```ts 
path="/evals/logger.ts" /** * This file defines the `EvalLogger` class, which is used to capture and manage * log lines during the evaluation process. The logger supports different log * levels (info, error, warn), stores logs in memory for later retrieval, and * also prints them to the console for immediate feedback. * * The `parseLogLine` function helps transform raw `LogLine` objects into a more * structured format (`LogLineEval`), making auxiliary data easier to understand * and analyze. By associating an `EvalLogger` instance with a `Stagehand` object, * all logs emitted during the evaluation process can be captured, persisted, and * reviewed after the tasks complete. */ import { logLineToString } from "./utils"; import { LogLineEval } from "@/types/evals"; import { Stagehand, LogLine } from "@/dist"; /** * parseLogLine: * Given a LogLine, attempts to parse its `auxiliary` field into a structured object. * If parsing fails, logs an error and returns the original line. * * The `auxiliary` field in the log line typically contains additional metadata about the log event. */ function parseLogLine(logLine: LogLine): LogLineEval { try { let parsedAuxiliary: Record | undefined; if (logLine.auxiliary) { parsedAuxiliary = {}; for (const [key, entry] of Object.entries(logLine.auxiliary)) { try { parsedAuxiliary[key] = entry.type === "object" ? JSON.parse(entry.value) : entry.value; } catch (parseError) { console.warn(`Failed to parse auxiliary entry ${key}:`, parseError); // If parsing fails, use the raw value parsedAuxiliary[key] = entry.value; } } } return { ...logLine, auxiliary: undefined, parsedAuxiliary, } as LogLineEval; } catch (e) { console.log("Error parsing log line", logLine); console.error(e); return logLine; } } /** * EvalLogger: * A logger class used during evaluations to capture and print log lines. * * Capabilities: * - Maintains an internal array of log lines (EvalLogger.logs) for later retrieval. 
* - Can be initialized with a Stagehand instance to provide consistent logging. * - Supports logging at different levels (info, error, warn). * - Each log line is converted to a string and printed to console for immediate feedback. * - Also keeps a structured version of the logs that can be returned for analysis or * included in evaluation output. */ export class EvalLogger { private logs: LogLineEval[] = []; stagehand?: Stagehand; constructor() { this.logs = []; } /** * init: * Associates this logger with a given Stagehand instance. * This allows the logger to provide additional context if needed. */ init(stagehand: Stagehand) { this.stagehand = stagehand; } /** * log: * Logs a message at the default (info) level. * Uses `logLineToString` to produce a readable output on the console, * and then stores the parsed log line in `this.logs`. */ log(logLine: LogLine) { console.log(logLineToString(logLine)); this.logs.push(parseLogLine(logLine)); } /** * error: * Logs an error message with `console.error` and stores it. * Useful for capturing and differentiating error-level logs. */ error(logLine: LogLine) { console.error(logLineToString(logLine)); this.logs.push(parseLogLine(logLine)); } /** * warn: * Logs a warning message with `console.warn` and stores it. * Helps differentiate warnings from regular info logs. */ warn(logLine: LogLine) { console.warn(logLineToString(logLine)); this.logs.push(parseLogLine(logLine)); } /** * getLogs: * Retrieves the array of stored log lines. * Useful for returning logs after a task completes, for analysis or debugging. */ getLogs(): LogLineEval[] { return this.logs || []; } } ``` ## /evals/scoring.ts ```ts path="/evals/scoring.ts" /** * This file implements scoring functions needed by braintrust. 
*/ import { EvalArgs, EvalInput, EvalResult } from "@/types/evals"; /** * Scoring function: exactMatch * Given the arguments (including input, output, and expected result), * this returns a score of 1 if the result matches the expectation, and 0 otherwise. * * If "expected" is true, it checks if the output indicates success. * If "expected" is a boolean or an object with _success flag, * it checks if output is exactly that success condition. */ export function exactMatch( args: EvalArgs, ): EvalResult { console.log(`Task "${args.input.name}" returned: ${args.output}`); const expected = args.expected ?? true; if (expected === true) { // If we expect a success (true), then we check the output's _success flag. return { name: "Exact match", score: typeof args.output === "boolean" ? args.output ? 1 : 0 : args.output._success ? 1 : 0, }; } // If expected is not true, just directly compare the output to expected. return { name: "Exact match", score: args.output === expected ? 1 : 0, }; } /** * Scoring function: errorMatch * Determines if an error occurred in the task. * Scores 1 if an error is found, otherwise 0. */ export function errorMatch( args: EvalArgs< EvalInput, boolean | { _success: boolean; error?: unknown }, unknown >, ): EvalResult { console.log(`Task "${args.input.name}" returned: ${args.output}`); return { name: "Error rate", score: typeof args.output === "object" && args.output.error !== undefined ? 1 : 0, }; } ``` ## /evals/taskConfig.ts ```ts path="/evals/taskConfig.ts" /** * This file is responsible for: * - Loading and parsing the `evals.config.json` file, which defines tasks (evaluations) and their associated categories. * - Building a lookup structure (`tasksByName`) to map each task name to its categories. * - Filtering tasks based on command-line arguments (e.g., `filterByEvalName`) and ensuring that requested tasks exist. * - Determining which models to use for evaluations, depending on the category and environment variables. 
* - Validating that the chosen models are supported. * * The exported objects (`tasksByName`, `MODELS`, `config`) are used by the main evaluation script and other modules * to know which tasks and models are available, and to configure the evaluations accordingly. */ import fs from "fs"; import path from "path"; import { AvailableModel } from "@/dist"; import { filterByEvalName } from "./args"; const ALL_EVAL_MODELS = [ // GOOGLE "gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-1.5-flash", "gemini-2.5-pro-exp-03-25", "gemini-1.5-pro", "gemini-1.5-flash-8b", "gemini-2.5-flash-preview-04-17", "gemini-2.5-pro-preview-03-25", // ANTHROPIC "claude-3-5-sonnet-latest", "claude-3-7-sonnet-latest", // OPENAI "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview", "o3", "o3-mini", "o4-mini", // TOGETHER - META "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "meta-llama/Llama-3.3-70B-Instruct-Turbo", "meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", // TOGETHER - DEEPSEEK "deepseek-ai/DeepSeek-V3", "Qwen/Qwen2.5-7B-Instruct-Turbo", // GROQ "groq/meta-llama/llama-4-scout-17b-16e-instruct", "groq/llama-3.3-70b-versatile", "groq/llama3-70b-8192", "groq/qwen-qwq-32b", "groq/qwen-2.5-32b", "groq/deepseek-r1-distill-qwen-32b", "groq/deepseek-r1-distill-llama-70b", // CEREBRAS "cerebras/llama3.3-70b", ]; // The configuration file `evals.config.json` contains a list of tasks and their associated categories. const configPath = path.join(__dirname, "evals.config.json"); const config = JSON.parse(fs.readFileSync(configPath, "utf-8")) satisfies { tasks: { name: string; categories: string[]; }[]; }; /** * The `tasksConfig` defines all tasks from the config file. Each task has a name and categories. * We create a mapping `tasksByName` from task name to its categories for quick lookup. 
*/ type TaskConfig = { name: string; categories: string[]; extract_method?: string; }; const tasksConfig = config.tasks as TaskConfig[]; const tasksByName = tasksConfig.reduce< Record >((acc, task) => { acc[task.name] = { categories: task.categories, extractMethod: task.extract_method, }; return acc; }, {}); /** * If filtering by a specific eval name (task), ensure that this task actually exists. */ if (filterByEvalName && !tasksByName[filterByEvalName]) { console.error(`Error: Evaluation "${filterByEvalName}" does not exist.`); process.exit(1); } /** * Determine which models to run the evaluations against. * * DEFAULT_EVAL_MODELS: The default set of models used for most categories. */ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS ? process.env.EVAL_MODELS.split(",") : ["gemini-2.0-flash", "gpt-4.1-mini", "claude-3-5-sonnet-latest"]; const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS ? process.env.EVAL_AGENT_MODELS.split(",") : ["computer-use-preview", "claude-3-7-sonnet-20250219"]; /** * getModelList: * Returns a list of models to be used for the given category. * If category is "experimental", it merges DEFAULT_EVAL_MODELS and EXPERIMENTAL_EVAL_MODELS. * Otherwise, returns DEFAULT_EVAL_MODELS filtered by provider if specified. 
*/ const getModelList = (category?: string): string[] => { const provider = process.env.EVAL_PROVIDER?.toLowerCase(); if (category === "agent") { return DEFAULT_AGENT_MODELS; } if (provider) { return ALL_EVAL_MODELS.filter((model) => filterModelByProvider(model, provider), ); } // If no agent category and no provider, return default eval models return DEFAULT_EVAL_MODELS; }; // Helper function to contain the provider filtering logic const filterModelByProvider = (model: string, provider: string): boolean => { const modelLower = model.toLowerCase(); if (provider === "openai") { return modelLower.startsWith("gpt"); } else if (provider === "anthropic") { return modelLower.startsWith("claude"); } else if (provider === "google") { return modelLower.startsWith("gemini"); } else if (provider === "together") { return ( modelLower.startsWith("meta-llama") || modelLower.startsWith("llama") || modelLower.startsWith("deepseek") || modelLower.startsWith("qwen") ); } else if (provider === "groq") { return modelLower.startsWith("groq"); } else if (provider === "cerebras") { return modelLower.startsWith("cerebras"); } console.warn( `Unknown provider specified or model doesn't match: ${provider}`, ); return false; }; const MODELS: AvailableModel[] = getModelList().map((model) => { return model as AvailableModel; }); export { tasksByName, MODELS, tasksConfig, getModelList }; ``` ## /evals/tasks/agent/google_flights.ts ```ts path="/evals/tasks/agent/google_flights.ts" import { EvalFunction } from "@/types/evals"; import { Evaluator } from "../../evaluator"; export const google_flights: EvalFunction = async ({ debugUrl, sessionUrl, stagehand, logger, modelName, }) => { await stagehand.page.goto("https://google.com/travel/flights"); const agent = stagehand.agent({ model: modelName, provider: modelName.startsWith("claude") ? "anthropic" : "openai", instructions: `You are a helpful assistant that can help me with my tasks. 
You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}. The current page is ${await stagehand.page.title()}`,
  });

  const agentResult = await agent.execute({
    instruction:
      "Search for flights from San Francisco to New York for next weekend",
    maxSteps: 15,
  });
  logger.log(agentResult);

  const evaluator = new Evaluator(stagehand);
  const result = await evaluator.evaluate({
    question:
      "Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
    strictResponse: true,
  });

  // The evaluator must answer strictly YES or NO; anything else is a harness failure.
  if (result.evaluation !== "YES" && result.evaluation !== "NO") {
    await stagehand.close();
    return {
      _success: false,
      observations: "Evaluator provided an invalid response",
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }

  if (result.evaluation === "YES") {
    await stagehand.close();
    return {
      _success: true,
      observations: result.reasoning,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } else {
    await stagehand.close();
    return {
      _success: false,
      observations: result.reasoning,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};
```

## /evals/tasks/agent/iframe_form.ts

```ts path="/evals/tasks/agent/iframe_form.ts"
import { EvalFunction } from "@/types/evals";
import { Evaluator } from "../../evaluator";

// Agent eval: fill two fields of an iframe-hosted abuse form one at a time,
// verifying each field with the evaluator before moving on.
export const iframe_form: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  modelName,
}) => {
  await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");

  const agent = stagehand.agent({
    // Derive the provider from the model name, consistent with the other agent
    // evals in this directory (the original hard-coded "anthropic" here even
    // though `modelName` may be an OpenAI model).
    provider: modelName.startsWith("claude") ? "anthropic" : "openai",
    model: modelName,
  });

  const agentResult = await agent.execute({
    instruction: "Fill in the form name with 'John Smith'",
    maxSteps: 3,
  });
  logger.log(agentResult);

  // Scroll back up before evaluating the filled-in form state.
  await stagehand.page.mouse.wheel(0, -1000);

  const evaluator = new Evaluator(stagehand);
  const result = await evaluator.evaluate({
    question: "Is the form name input filled with 'John Smith'?",
    strictResponse: true,
  });

  if (result.evaluation !== "YES" && result.evaluation !== "NO") {
await stagehand.close(); return { _success: false, observations: "Evaluator provided an invalid response", debugUrl, sessionUrl, logs: logger.getLogs(), }; } const agentResult2 = await agent.execute({ instruction: "Fill in the form email with 'john.smith@example.com'", maxSteps: 3, }); logger.log(agentResult2); await stagehand.page.mouse.wheel(0, -1000); const result2 = await evaluator.evaluate({ question: "Is the form email input filled with 'john.smith@example.com'?", strictResponse: true, }); if (result2.evaluation !== "YES" && result2.evaluation !== "NO") { await stagehand.close(); return { _success: false, observations: "Evaluator provided an invalid response", debugUrl, sessionUrl, logs: logger.getLogs(), }; } if (result.evaluation === "YES" && result2.evaluation === "YES") { await stagehand.close(); return { _success: true, observations: "All fields were filled correctly", debugUrl, sessionUrl, logs: logger.getLogs(), }; } else { await stagehand.close(); return { _success: false, observations: "One or more fields were not filled correctly", debugUrl, sessionUrl, logs: logger.getLogs(), }; } }; ``` ## /evals/tasks/agent/iframe_form_multiple.ts ```ts path="/evals/tasks/agent/iframe_form_multiple.ts" import { EvalFunction } from "@/types/evals"; import { Evaluator } from "../../evaluator"; export const iframe_form_multiple: EvalFunction = async ({ debugUrl, sessionUrl, stagehand, logger, modelName, }) => { await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/"); const agent = stagehand.agent({ provider: modelName.startsWith("claude") ? "anthropic" : "openai", model: modelName, }); const agentResult = await agent.execute({ instruction: "Fill in the form name with 'John Smith', the email with 'john.smith@example.com', and select the 'Are you the domain owner?' 
option as 'No'", maxSteps: 10, }); logger.log(agentResult); await stagehand.page.mouse.wheel(0, -1000); const evaluator = new Evaluator(stagehand); const results = await evaluator.batchEvaluate({ questions: [ "Is the form name input filled with 'John Smith'?", "Is the form email input filled with 'john.smith@example.com'?", "Is the 'Are you the domain owner?' option selected as 'No'?", ], strictResponse: true, }); for (const r of results) { if (r.evaluation !== "YES" && r.evaluation !== "NO") { await stagehand.close(); return { _success: false, observations: "Evaluator provided an invalid response", debugUrl, sessionUrl, logs: logger.getLogs(), }; } if (r.evaluation === "NO") { await stagehand.close(); return { _success: false, observations: r.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), }; } } await stagehand.close(); return { _success: true, observations: "All fields were filled correctly", debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` ## /evals/tasks/agent/sf_library_card.ts ```ts path="/evals/tasks/agent/sf_library_card.ts" import { EvalFunction } from "@/types/evals"; import { Evaluator } from "../../evaluator"; export const sf_library_card: EvalFunction = async ({ debugUrl, sessionUrl, stagehand, logger, modelName, }) => { await stagehand.page.goto("https://sflib1.sfpl.org/selfreg"); const agent = stagehand.agent({ model: modelName, provider: modelName.startsWith("claude") ? "anthropic" : "openai", instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. 
The current page is ${await stagehand.page.title()}`, }); const agentResult = await agent.execute({ instruction: "Fill in the 'Residential Address' field with '166 Geary St'", maxSteps: 3, }); logger.log(agentResult); await stagehand.page.mouse.wheel(0, -1000); const evaluator = new Evaluator(stagehand); const result = await evaluator.evaluate({ question: "Does the page show the 'Residential Address' field filled with '166 Geary St'?", strictResponse: true, }); if (result.evaluation !== "YES" && result.evaluation !== "NO") { await stagehand.close(); return { _success: false, observations: "Evaluator provided an invalid response", debugUrl, sessionUrl, logs: logger.getLogs(), }; } if (result.evaluation === "YES") { await stagehand.close(); return { _success: true, observations: result.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), }; } else { await stagehand.close(); return { _success: false, observations: result.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), }; } }; ``` ## /evals/tasks/agent/sf_library_card_multiple.ts ```ts path="/evals/tasks/agent/sf_library_card_multiple.ts" import { EvalFunction } from "@/types/evals"; import { Evaluator } from "../../evaluator"; export const sf_library_card_multiple: EvalFunction = async ({ debugUrl, sessionUrl, stagehand, logger, modelName, }) => { await stagehand.page.goto("https://sflib1.sfpl.org/selfreg"); const agent = stagehand.agent({ model: modelName, provider: modelName.startsWith("claude") ? "anthropic" : "openai", instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`, }); const agentResult = await agent.execute({ instruction: "Fill in ALL the required fields with mock data. 
DO NOT submit the form", maxSteps: 20, }); logger.log(agentResult); const evaluator = new Evaluator(stagehand); const result = await evaluator.evaluate({ question: "Does the page show all the required fields filled?", strictResponse: true, }); if (result.evaluation !== "YES" && result.evaluation !== "NO") { await stagehand.close(); return { _success: false, observations: "Evaluator provided an invalid response", debugUrl, sessionUrl, logs: logger.getLogs(), }; } if (result.evaluation === "YES") { await stagehand.close(); return { _success: true, observations: result.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), }; } else { await stagehand.close(); return { _success: false, observations: result.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), }; } }; ``` ## /evals/tasks/allrecipes.ts ```ts path="/evals/tasks/allrecipes.ts" import { EvalFunction } from "@/types/evals"; import { z } from "zod"; export const allrecipes: EvalFunction = async ({ logger, useTextExtract, debugUrl, sessionUrl, stagehand, }) => { await stagehand.page.goto("https://www.allrecipes.com/", { waitUntil: "domcontentloaded", }); await stagehand.page.act({ action: 'Type "chocolate chip cookies" in the search bar', }); await stagehand.page.act({ action: "press enter", }); const recipeDetails = await stagehand.page.extract({ instruction: "Extract the title of the first recipe and the total number of ratings it has received.", schema: z.object({ title: z.string().describe("Title of the recipe"), total_ratings: z .string() .describe("Total number of ratings for the recipe"), }), useTextExtract, }); await stagehand.close(); const { title, total_ratings } = recipeDetails; const expectedTitle = "Best Chocolate Chip Cookies"; const expectedRatings = 19164; const extractedRatings = parseInt(total_ratings.replace(/[^\d]/g, ""), 10); const isRatingsWithinRange = extractedRatings >= expectedRatings - 1000 && extractedRatings <= expectedRatings + 1000; if (title !== expectedTitle || 
!isRatingsWithinRange) { const errors = []; if (title !== expectedTitle) { errors.push({ message: "Extracted title does not match the expected title", expected: expectedTitle, actual: title, }); } if (!isRatingsWithinRange) { errors.push({ message: "Extracted ratings are not within the expected range", expected: `${expectedRatings} ± 1000`, actual: extractedRatings.toString(), }); } logger.error({ message: "Failed to extract correct recipe details", level: 0, auxiliary: { errors: { value: JSON.stringify(errors), type: "object", }, }, }); return { _success: false, error: "Recipe details extraction validation failed", logs: logger.getLogs(), debugUrl, sessionUrl, }; } return { _success: true, recipeDetails: { title, total_ratings: extractedRatings, }, logs: logger.getLogs(), debugUrl, sessionUrl, }; }; ``` ## /evals/tasks/amazon_add_to_cart.ts ```ts path="/evals/tasks/amazon_add_to_cart.ts" import { EvalFunction } from "@/types/evals"; export const amazon_add_to_cart: EvalFunction = async ({ logger, debugUrl, sessionUrl, stagehand, }) => { await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/", ); await stagehand.page.waitForTimeout(5000); await stagehand.page.act({ action: "click the 'Add to Cart' button", }); await stagehand.page.waitForTimeout(2000); await stagehand.page.act({ action: "click the 'Proceed to checkout' button", }); await stagehand.page.waitForTimeout(2000); const currentUrl = stagehand.page.url(); const expectedUrl = "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/sign-in.html"; await stagehand.close(); return { _success: currentUrl === expectedUrl, currentUrl, debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` ## /evals/tasks/apple.ts ```ts path="/evals/tasks/apple.ts" import { EvalFunction } from "@/types/evals"; export const apple: EvalFunction = async ({ logger, debugUrl, sessionUrl, stagehand, }) => { await stagehand.page.goto("https://www.apple.com/iphone-16-pro/"); await 
stagehand.page.act({ action: "click on the buy button" }); await stagehand.page.act({ action: "select the Pro Max model" }); await stagehand.page.act({ action: "select the natural titanium color" }); await stagehand.page.act({ action: "select the 256GB storage option" }); await stagehand.page.act({ action: "click on the 'select a smartphone' trade-in option", }); await stagehand.page.act({ action: "select the iPhone 13 mini model from the dropdown", }); await stagehand.page.act({ action: "select the iPhone 13 mini is in good condition", }); const successMessageLocator = stagehand.page.locator( 'text="Good News. Your iPhone 13 mini qualifies for credit."', ); const isVisible = await successMessageLocator.isVisible(); await stagehand.close(); return { _success: isVisible, debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` ## /evals/tasks/arxiv.ts ```ts path="/evals/tasks/arxiv.ts" import { EvalFunction } from "@/types/evals"; import { z } from "zod"; export const arxiv: EvalFunction = async ({ logger, debugUrl, sessionUrl, stagehand, useTextExtract, }) => { try { await stagehand.page.goto("https://arxiv.org/search/"); await stagehand.page.act( "type web agents with multimodal models in the search bar", ); await stagehand.page.act("hit enter"); const paper_links = await stagehand.page.extract({ instruction: "extract the titles and links for two papers", schema: z.object({ papers: z .array( z.object({ title: z.string().describe("the title of the paper"), link: z.string().describe("the link to the paper").nullable(), }), ) .describe("list of papers"), }), useTextExtract, }); if ( !paper_links || !paper_links.papers || paper_links.papers.length === 0 ) { await stagehand.close(); return { _success: false, logs: logger.getLogs(), debugUrl, sessionUrl, }; } const papers = []; for (const paper of paper_links.papers) { if (paper.link) { await stagehand.page.goto(paper.link); const abstract = await stagehand.page.extract({ instruction: "extract details of the paper from 
the abstract", schema: z.object({ category: z .string() .describe( "the category of the paper. one of {'Benchmark', 'Dataset', 'Model', 'Framework', 'System', 'Other'}", ), problem: z .string() .describe( "summarize the problem that the paper is trying to solve in one sentence", ) .nullable(), methodology: z .string() .describe( "summarize the methodology of the paper in one sentence", ) .nullable(), results: z .string() .describe("summarize the results of the paper in one sentence") .nullable(), conclusion: z .string() .describe("summarize the conclusion of the paper in one sentence") .nullable(), code: z .string() .describe( "if provided, extract only the link to the code repository, without additional text. this is often optional and not always provided.", ) .nullable(), }), useTextExtract, }); papers.push({ title: paper.title, link: paper.link, ...abstract, }); } } if (!papers || papers.length === 0) { await stagehand.close(); return { _success: false, logs: logger.getLogs(), debugUrl, sessionUrl, }; } if (papers.length !== 2) { logger.error({ message: "incorrect number of papers extracted", level: 0, auxiliary: { expected: { value: "2", type: "integer", }, actual: { value: papers.length.toString(), type: "integer", }, }, }); await stagehand.close(); return { _success: false, error: "Incorrect number of papers extracted", logs: logger.getLogs(), debugUrl, sessionUrl, }; } // Ensure that every paper has a problem and methodology for (const paper of papers) { if (!paper.problem || !paper.methodology) { logger.error({ message: `paper missing problem or methodology`, level: 0, auxiliary: { paper: { value: JSON.stringify(paper), type: "object", }, }, }); await stagehand.close(); return { _success: false, error: "Incomplete paper information", logs: logger.getLogs(), debugUrl, sessionUrl, }; } } await stagehand.close(); return { _success: true, papers, logs: logger.getLogs(), debugUrl, sessionUrl, }; } catch (error) { logger.error({ message: `error in arxiv 
function`, level: 0, auxiliary: { error: { value: error.message, type: "string", }, trace: { value: error.stack, type: "string", }, }, }); await stagehand.close(); return { _success: false, logs: logger.getLogs(), debugUrl, sessionUrl, }; } }; ``` ## /evals/tasks/bidnet.ts ```ts path="/evals/tasks/bidnet.ts" import { EvalFunction } from "@/types/evals"; export const bidnet: EvalFunction = async ({ logger, debugUrl, sessionUrl, stagehand, }) => { await stagehand.page.goto("https://www.bidnetdirect.com/"); await stagehand.page.act({ action: 'Click on the "Construction" keyword', }); const expectedUrl = "https://www.bidnetdirect.com/public/solicitations/open?keywords=Construction"; const currentUrl = stagehand.page.url(); await stagehand.close(); return { _success: currentUrl.startsWith(expectedUrl), currentUrl, debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` ## /evals/tasks/checkboxes.ts ```ts path="/evals/tasks/checkboxes.ts" import { EvalFunction } from "@/types/evals"; export const checkboxes: EvalFunction = async ({ debugUrl, sessionUrl, stagehand, logger, }) => { await stagehand.page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/checkboxes/", ); await stagehand.page.act({ action: "click the 'baseball' option", }); await stagehand.page.act({ action: "click the 'netball' option", }); const baseballChecked = await stagehand.page .locator('input[type="checkbox"][name="sports"][value="baseball"]') .isChecked(); const netballChecked = await stagehand.page .locator('input[type="checkbox"][name="sports"][value="netball"]') .isChecked(); await stagehand.close(); return { _success: baseballChecked && netballChecked, debugUrl, sessionUrl, logs: logger.getLogs(), }; }; ``` The content has been capped at 50000 tokens, and files over NaN bytes have been omitted. The user could consider applying other filters to refine the result. The better and more specific the context, the better the LLM can follow instructions. 
If the context seems verbose, the user can refine the filter using uithub. Thank you for using https://uithub.com - Perfect LLM context for any GitHub repo.