From 10427687dde34acb3ac855f8ea51e1782bc605d1 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 19:59:43 -0500
Subject: [PATCH 1/7] feat: add HTTP-first web fetch tool

---
 CHANGELOG.md                                  |   6 +-
 README.md                                     |  26 +
 lib/jido_browser.ex                           |  24 +
 lib/jido_browser/actions/web_fetch.ex         |  92 ++
 lib/jido_browser/plugin.ex                    | 107 +-
 lib/jido_browser/web_fetch.ex                 | 915 ++++++++++++++++++
 mix.exs                                       |   7 +-
 .../composite_actions_and_installer_test.exs  |  59 ++
 test/jido_browser/plugin_test.exs             |  44 +-
 test/jido_browser/web_fetch_test.exs          | 166 ++++
 10 files changed, 1418 insertions(+), 28 deletions(-)
 create mode 100644 lib/jido_browser/actions/web_fetch.ex
 create mode 100644 lib/jido_browser/web_fetch.ex
 create mode 100644 test/jido_browser/web_fetch_test.exs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fb19c91..99820c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add HTTP-first `Jido.Browser.web_fetch/2` and `Jido.Browser.Actions.WebFetch` for stateless page retrieval with domain policy, focused filtering, caching, and citation-ready passages
+
 ### Changed
 
 - Rename the public Elixir namespace from `JidoBrowser.*` to `Jido.Browser.*`
@@ -110,4 +114,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Refactoring:
 
-* streamline agent-browser runtime defaults by mikehostetler
\ No newline at end of file
+* streamline agent-browser runtime defaults by mikehostetler
diff --git a/README.md b/README.md
index b7204cc..c1d1808 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,22 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow:
 2. act on `@eN` refs
 3. re-snapshot
 
+### Stateless Web Fetch
+
+```elixir
+{:ok, result} =
+  Jido.Browser.web_fetch(
+    "https://example.com/docs",
+    format: :markdown,
+    allowed_domains: ["example.com"],
+    focus_terms: ["API", "authentication"],
+    citations: true
+  )
+
+result.content
+result.passages
+```
+
 ### State Persistence
 
 ```elixir
@@ -143,6 +159,14 @@ config :jido_browser, :web,
   profile: "default"
 ```
 
+Optional web fetch settings:
+
+```elixir
+config :jido_browser, :web_fetch,
+  cache_ttl_ms: 300_000,
+  pdftotext_path: "/usr/local/bin/pdftotext"
+```
+
 ## Backends
 
 ### AgentBrowser (Default)
@@ -173,6 +197,7 @@ Core operations:
 - `type/4`
 - `screenshot/2`
 - `extract_content/2`
+- `web_fetch/2`
 - `evaluate/3`
 
 Agent-browser-native operations:
@@ -252,6 +277,7 @@ Agent-browser-native operations:
 - `ReadPage`
 - `SnapshotUrl`
 - `SearchWeb`
+- `WebFetch`
 
 ## Using With Jido Agents
 
diff --git a/lib/jido_browser.ex b/lib/jido_browser.ex
index 58f3e3a..91139ac 100644
--- a/lib/jido_browser.ex
+++ b/lib/jido_browser.ex
@@ -8,11 +8,13 @@ defmodule Jido.Browser do
 
   alias Jido.Browser.Error
   alias Jido.Browser.Session
+  alias Jido.Browser.WebFetch
 
   @default_adapter Jido.Browser.Adapters.AgentBrowser
   @default_timeout 30_000
   @supported_screenshot_formats [:png]
   @supported_extract_formats [:markdown, :html, :text]
+  @supported_web_fetch_formats [:markdown, :html, :text]
 
   @doc "Starts a browser session using the configured adapter or an explicit adapter override."
   @spec start_session(keyword()) :: {:ok, Session.t()} | {:error, term()}
@@ -107,6 +109,28 @@ defmodule Jido.Browser do
     end
   end
 
+  @doc "Retrieves a URL with a plain HTTP(S) request via `Jido.Browser.WebFetch`; no browser session is started."
+  @spec web_fetch(String.t(), keyword()) :: {:ok, map()} | {:error, term()}
+  def web_fetch(url, opts \\ [])
+
+  def web_fetch(url, _opts) when is_nil(url) or url == "" do
+    {:error, Error.invalid_error("URL cannot be nil or empty", %{url: url})}
+  end
+
+  def web_fetch(url, opts) when is_binary(url) do
+    case opts[:format] || :markdown do
+      format when format in @supported_web_fetch_formats ->
+        WebFetch.fetch(url, normalize_timeout(opts))
+
+      format ->
+        {:error,
+         Error.invalid_error("Unsupported web fetch format: #{inspect(format)}", %{
+           format: format,
+           supported: @supported_web_fetch_formats
+         })}
+    end
+  end
+
   @doc "Evaluates JavaScript in the browser when the adapter supports it."
   @spec evaluate(Session.t(), String.t(), keyword()) ::
           {:ok, Session.t(), map()} | {:error, term()}
diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex
new file mode 100644
index 0000000..417f07d
--- /dev/null
+++ b/lib/jido_browser/actions/web_fetch.ex
@@ -0,0 +1,92 @@
+defmodule Jido.Browser.Actions.WebFetch do
+  @moduledoc """
+  Agent action wrapping `Jido.Browser.web_fetch/2` for stateless page retrieval.
+
+  No browser session is involved: the page is requested over plain HTTP(S), so
+  it suits content that does not need JavaScript. `max_uses` is compared with
+  the `web_fetch_uses` counter in `context.skill_state` before any request.
+  """
+
+  use Jido.Action,
+    name: "web_fetch",
+    description:
+      "Fetch a URL over HTTP(S) with domain policy controls, optional focused filtering, " <>
+        "approximate token caps, and citation-ready passages.",
+    category: "Browser",
+    tags: ["browser", "web", "fetch", "http", "retrieval"],
+    vsn: "2.0.0",
+    schema: [
+      url: [type: :string, required: true, doc: "The URL to fetch"],
+      format: [type: {:in, [:markdown, :text, :html]}, default: :markdown, doc: "Output format"],
+      selector: [type: :string, doc: "Optional CSS selector for HTML pages"],
+      allowed_domains: [type: {:list, :string}, default: [], doc: "Allow-list of host or host/path rules"],
+      blocked_domains: [type: {:list, :string}, default: [], doc: "Block-list of host or host/path rules"],
+      focus_terms: [type: {:list, :string}, default: [], doc: "Terms used to filter the fetched document"],
+      focus_window: [type: :integer, default: 0, doc: "Paragraph window around each focus match"],
+      max_content_tokens: [type: :integer, doc: "Approximate token cap for returned content"],
+      citations: [type: :boolean, default: false, doc: "Include citation-ready passage offsets"],
+      cache: [type: :boolean, default: true, doc: "Reuse cached fetch results when available"],
+      timeout: [type: :integer, doc: "Receive timeout in milliseconds"],
+      require_known_url: [type: :boolean, default: false, doc: "Require the URL to already be present in tool context"],
+      known_urls: [type: {:list, :string}, default: [], doc: "Additional known URLs accepted for provenance checks"],
+      max_uses: [type: :integer, doc: "Maximum successful web fetch calls allowed in current skill state"]
+    ]
+
+  alias Jido.Browser.Error
+
+  @impl true
+  def run(params, context) do
+    # The usage cap is checked first, so an exhausted budget never triggers a
+    # fetch. Struct errors pass through unchanged; any other error reason is
+    # wrapped in an adapter error.
+    with :ok <- validate_max_uses(params, context) do
+      case Jido.Browser.web_fetch(params.url, build_opts(params, context)) do
+        {:ok, fetched} -> {:ok, Map.put(fetched, :status, "success")}
+        {:error, %_{} = error} -> {:error, error}
+        {:error, reason} -> {:error, Error.adapter_error("Web fetch failed", %{reason: reason})}
+      end
+    end
+  end
+
+  defp build_opts(params, context) do
+    known_urls = Enum.uniq((params[:known_urls] || []) ++ (get_in(context, [:skill_state, :seen_urls]) || []))
+
+    candidates = [
+      format: Map.get(params, :format, :markdown),
+      selector: params[:selector],
+      allowed_domains: Map.get(params, :allowed_domains, []),
+      blocked_domains: Map.get(params, :blocked_domains, []),
+      focus_terms: Map.get(params, :focus_terms, []),
+      focus_window: Map.get(params, :focus_window, 0),
+      max_content_tokens: params[:max_content_tokens],
+      citations: Map.get(params, :citations, false),
+      cache: Map.get(params, :cache, true),
+      timeout: params[:timeout],
+      require_known_url: Map.get(params, :require_known_url, false),
+      known_urls: known_urls
+    ]
+
+    Enum.reduce(candidates, [], fn {key, value}, acc -> maybe_put(acc, key, value) end)
+  end
+
+  defp validate_max_uses(%{max_uses: limit}, context) when is_integer(limit) and limit >= 0 do
+    case get_in(context, [:skill_state, :web_fetch_uses]) || 0 do
+      used when used >= limit ->
+        {:error,
+         Error.invalid_error("Web fetch max uses exceeded", %{
+           error_code: :max_uses_exceeded,
+           max_uses: limit,
+           current_uses: used
+         })}
+
+      _within_limit ->
+        :ok
+    end
+  end
+
+  defp validate_max_uses(_params, _context), do: :ok
+
+  # nil and [] count as "not provided" and are left out of the options.
+  defp maybe_put(opts, _key, value) when value in [nil, []], do: opts
+  defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value)
+end
diff --git a/lib/jido_browser/plugin.ex b/lib/jido_browser/plugin.ex
index 4e99408..2a58bba 100644
--- a/lib/jido_browser/plugin.ex
+++ b/lib/jido_browser/plugin.ex
@@ -36,6 +36,7 @@ require Jido.Browser.Actions.WaitForSelector
 require Jido.Browser.Actions.ReadPage
 require Jido.Browser.Actions.SnapshotUrl
 require Jido.Browser.Actions.SearchWeb
+require Jido.Browser.Actions.WebFetch
 
 defmodule Jido.Browser.Plugin do
   @moduledoc """
@@ -119,7 +120,8 @@ defmodule Jido.Browser.Plugin do
       # Self-contained composite actions (manage own session)
       Jido.Browser.Actions.ReadPage,
       Jido.Browser.Actions.SnapshotUrl,
-      Jido.Browser.Actions.SearchWeb
+      Jido.Browser.Actions.SearchWeb,
+      Jido.Browser.Actions.WebFetch
     ],
     description: "Browser automation for web navigation, interaction, and content extraction",
     category: "browser",
@@ -136,7 +138,9 @@ defmodule Jido.Browser.Plugin do
       viewport: Map.get(config, :viewport, %{width: 1280, height: 720}),
       base_url: Map.get(config, :base_url),
       last_url: nil,
-      last_title: nil
+      last_title: nil,
+      seen_urls: [],
+      web_fetch_uses: 0
     }
 
     {:ok, initial_state}
@@ -151,7 +155,9 @@ defmodule Jido.Browser.Plugin do
       viewport: Zoi.any(description: "Browser viewport dimensions") |> Zoi.optional(),
       base_url: Zoi.string(description: "Base URL for relative navigation") |> Zoi.optional(),
       last_url: Zoi.string(description: "Last navigated URL") |> Zoi.optional(),
-      last_title: Zoi.string(description: "Last page title") |> Zoi.optional()
+      last_title: Zoi.string(description: "Last page title") |> Zoi.optional(),
+      seen_urls: Zoi.array(Zoi.string(description: "Known URLs discovered during tool use")) |> Zoi.default([]),
+      web_fetch_uses: Zoi.integer(description: "Successful web fetch calls in current skill state") |> Zoi.default(0)
     })
   end
 
@@ -204,7 +210,8 @@ defmodule Jido.Browser.Plugin do
       # Self-contained composite actions
       {"browser.read_page", Jido.Browser.Actions.ReadPage},
       {"browser.snapshot_url", Jido.Browser.Actions.SnapshotUrl},
-      {"browser.search_web", Jido.Browser.Actions.SearchWeb}
+      {"browser.search_web", Jido.Browser.Actions.SearchWeb},
+      {"browser.web_fetch", Jido.Browser.Actions.WebFetch}
     ]
   end
 
@@ -214,22 +221,17 @@ defmodule Jido.Browser.Plugin do
   end
 
   @impl Jido.Plugin
-  def transform_result(_action, {:ok, result}, _context) when is_map(result) do
-    case Map.get(result, :session) do
-      %Jido.Browser.Session{} = session ->
-        current_url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url])
-        current_title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title])
-
-        state_updates = %{
-          session: session,
-          last_url: current_url,
-          last_title: current_title
-        }
-
-        {:ok, result, state_updates}
+  def transform_result(action, {:ok, result}, context) when is_map(result) do
+    state_updates =
+      %{}
+      |> maybe_put_session_state(result)
+      |> maybe_put_seen_urls(result, context)
+      |> maybe_increment_web_fetch_uses(action, context)
 
-      _ ->
-        {:ok, result}
+    if map_size(state_updates) == 0 do
+      {:ok, result}
+    else
+      {:ok, result, state_updates}
     end
   end
 
@@ -260,6 +262,70 @@ defmodule Jido.Browser.Plugin do
     end
   end
 
+  # Adds `session`, `last_url` and `last_title` to `acc` when the result carries
+  # a `%Jido.Browser.Session{}` under `:session`; other results leave `acc` as is.
+  # URL and title are taken from the result (atom key first, then string key)
+  # and fall back to the session's `:connection` entry.
+  defp maybe_put_session_state(acc, %{session: %Jido.Browser.Session{} = session} = result) do
+    url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url])
+    title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title])
+
+    Map.merge(acc, %{
+      session: session,
+      last_url: url,
+      last_title: title
+    })
+  end
+
+  defp maybe_put_session_state(acc, _result), do: acc
+
+  # Merges URLs found in `result` into the `seen_urls` list from skill state.
+  # Entries already known keep their position; newly found URLs are appended,
+  # and nil, "" and duplicate values are dropped. `acc` is returned untouched
+  # when the merged list is empty or identical to the current list.
+  defp maybe_put_seen_urls(acc, result, context) do
+    known = get_in(context, [:skill_state, :seen_urls]) || []
+
+    merged =
+      known
+      |> Kernel.++(extract_urls(result))
+      |> Enum.reject(&nil_or_empty?/1)
+      |> Enum.uniq()
+
+    if merged == [] or merged == known, do: acc, else: Map.put(acc, :seen_urls, merged)
+  end
+
+  # Only results produced by the WebFetch action advance the usage counter.
+  defp maybe_increment_web_fetch_uses(acc, Jido.Browser.Actions.WebFetch, context) do
+    Map.put(acc, :web_fetch_uses, (get_in(context, [:skill_state, :web_fetch_uses]) || 0) + 1)
+  end
+
+  defp maybe_increment_web_fetch_uses(acc, _action, _context), do: acc
+
+  # Collects the result's own URL fields plus the `url` of each map under `:results` / "results".
+  defp extract_urls(result) do
+    own_urls =
+      [:url, "url", :final_url, "final_url"]
+      |> Enum.map(&Map.get(result, &1))
+      |> Enum.reject(&nil_or_empty?/1)
+
+    hit_urls =
+      result
+      |> Map.get(:results, Map.get(result, "results", []))
+      |> List.wrap()
+      |> Enum.map(fn
+        hit when is_map(hit) -> Map.get(hit, :url) || Map.get(hit, "url")
+        _other -> nil
+      end)
+      |> Enum.reject(&nil_or_empty?/1)
+
+    own_urls ++ hit_urls
+  end
+
+  # A plain predicate (hence the `?` suffix rather than an `is_` prefix): true for nil and "".
+  defp nil_or_empty?(value) when value in [nil, ""], do: true
+  defp nil_or_empty?(_value), do: false
+
   def signal_patterns do
     [
       # Session lifecycle
@@ -308,7 +374,8 @@ defmodule Jido.Browser.Plugin do
       # Self-contained composite actions
       "browser.read_page",
       "browser.snapshot_url",
-      "browser.search_web"
+      "browser.search_web",
+      "browser.web_fetch"
     ]
   end
 end
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
new file mode 100644
index 0000000..8599598
--- /dev/null
+++ b/lib/jido_browser/web_fetch.ex
@@ -0,0 +1,915 @@
+defmodule Jido.Browser.WebFetch do
+  @moduledoc """
+  Stateless HTTP-first web retrieval with optional domain policy, caching,
+  focused filtering, and citation-ready passage metadata.
+
+  This module is intended for document retrieval workloads where starting a full
+  browser session would be unnecessary or too expensive.
+  """
+
+  alias Jido.Browser.Error
+
+  @cache_table :jido_browser_web_fetch_cache
+  @default_timeout 15_000
+  @default_max_redirects 5
+  @default_cache_ttl_ms 300_000
+  @default_max_url_length 2_048
+  @supported_formats [:markdown, :text, :html]
+  @html_content_types ["text/html", "application/xhtml+xml"]
+  @text_content_types ["text/plain", "text/markdown", "text/csv", "text/xml", "application/xml"]
+  @pdf_content_types ["application/pdf"]
+
+  @type result :: %{
+          required(:url) => String.t(),
+          required(:final_url) => String.t(),
+          required(:content) => String.t(),
+          required(:format) => atom(),
+          required(:content_type) => String.t(),
+          required(:document_type) => atom(),
+          required(:retrieved_at) => String.t(),
+          required(:estimated_tokens) => non_neg_integer(),
+          required(:original_estimated_tokens) => non_neg_integer(),
+          required(:truncated) => boolean(),
+          required(:filtered) => boolean(),
+          required(:focus_matches) => non_neg_integer(),
+          required(:cached) => boolean(),
+          required(:citations) => %{enabled: boolean()},
+          required(:passages) => list(map()),
+          optional(:title) => String.t() | nil
+        }
+
+  @doc """
+  Fetches a URL over HTTP(S) and returns normalized document content.
+
+  Options, URL, provenance and domain rules are validated first. A cached result
+  is then returned directly; only a cache miss triggers an HTTP request.
+
+  Supported options:
+  - `:format` - `:markdown` (default), `:text`, or `:html`
+  - `:selector` - CSS selector for HTML pages
+  - `:allowed_domains` / `:blocked_domains` - mutually exclusive host/path rules
+  - `:max_content_tokens` - approximate token cap
+  - `:citations` - boolean, when true include passage spans
+  - `:focus_terms` - list of terms used for focused filtering (rejected with `:html` output)
+  - `:focus_window` - paragraph window around focus matches
+  - `:timeout` - receive timeout in milliseconds
+  - `:max_redirects` / `:max_url_length` - redirect limit (default `5`) and URL length cap (default `2048`)
+  - `:cache` - enable ETS cache, defaults to `true`
+  - `:cache_ttl_ms` - cache TTL in milliseconds
+  - `:require_known_url` / `:known_urls` - optional URL provenance guard
+  """
+  @spec fetch(String.t(), keyword()) :: {:ok, result()} | {:error, Exception.t()}
+  def fetch(url, opts \\ [])
+
+  def fetch(url, opts) when is_binary(url) and is_list(opts) do
+    # A cache hit (`{:ok, result}`) fails the `:miss` match and is returned as-is.
+    with {:ok, opts} <- normalize_opts(opts),
+         {:ok, normalized_url, uri} <- validate_url(url, opts),
+         :ok <- validate_known_url(normalized_url, opts),
+         :ok <- validate_domain_filters(uri, opts),
+         :miss <- fetch_cached(normalized_url, opts) do
+      do_fetch(normalized_url, opts)
+    end
+  end
+
+  def fetch(url, _opts) do
+    {:error, Error.invalid_error("URL must be a non-empty string", %{error_code: :invalid_input, url: url})}
+  end
+
+  @doc false
+  @spec clear_cache() :: :ok
+  def clear_cache do
+    # A missing cache table is treated as already clear.
+    table = :ets.whereis(@cache_table)
+
+    if table != :undefined do
+      :ets.delete_all_objects(table)
+    end
+
+    :ok
+  end
+
+  # Performs the HTTP request and turns the outcome into a result or an adapter
+  # error. The domain filters run again on the final URL because redirects are
+  # followed; only a successfully built result is handed to the cache.
+  defp do_fetch(url, opts) do
+    req_opts = [
+      url: url,
+      headers: request_headers(),
+      receive_timeout: opts[:timeout],
+      redirect: true,
+      max_redirects: opts[:max_redirects]
+    ]
+
+    case Req.run(req_opts) do
+      {%Req.Request{} = final_request, %Req.Response{} = response} ->
+        with :ok <- validate_http_status(response, url),
+             {:ok, final_url, final_uri} <- normalize_final_url(final_request),
+             :ok <- validate_domain_filters(final_uri, opts),
+             {:ok, result} <- build_result(url, final_url, response, opts) do
+          maybe_store_cache(url, opts, result)
+          {:ok, result}
+        end
+
+      {_request, %Req.TransportError{} = exception} ->
+        {:error, Error.adapter_error("Web fetch request failed", %{error_code: :url_not_accessible, reason: exception})}
+
+      {_request, %Req.TooManyRedirectsError{} = exception} ->
+        {:error,
+         Error.adapter_error("Web fetch exceeded redirect limit", %{error_code: :url_not_accessible, reason: exception})}
+
+      {_request, other} ->
+        {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: other})}
+    end
+  end
+
+  defp build_result(url, final_url, response, opts) do
+    content_type = response_content_type(response)
+
+    cond do
+      content_type in @html_content_types ->
+        build_html_result(url, final_url, response.body, content_type, opts)
+
+      content_type in @pdf_content_types ->
+        build_pdf_result(url, final_url, response.body, content_type, opts)
+
+      text_content_type?(content_type) ->
+        build_text_result(url, final_url, response.body, content_type, opts)
+
+      true ->
+        {:error,
+         Error.adapter_error("Unsupported content type for web fetch", %{
+           error_code: :unsupported_content_type,
+           content_type: content_type
+         })}
+    end
+  end
+
+  defp build_html_result(url, final_url, body, content_type, opts) when is_binary(body) do
+    selector = opts[:selector]
+
+    with {:ok, document} <- parse_document(body),
+         {:ok, html} <- select_html(document, body, selector),
+         {:ok, title} <- extract_title(document),
+         {:ok, content} <- format_html(html, opts[:format], opts),
+         {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
+         {final_content, truncated, original_estimated_tokens} <-
+           maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+      {:ok,
+       build_response(
+         url,
+         final_url,
+         final_content,
+         title,
+         content_type,
+         :html,
+         opts,
+         truncated,
+         filtered,
+         focus_matches,
+         original_estimated_tokens
+       )}
+    end
+  end
+
+  defp build_html_result(_url, _final_url, body, content_type, _opts) do
+    {:error,
+     Error.adapter_error("Unexpected response body for HTML fetch", %{
+       error_code: :unavailable,
+       content_type: content_type,
+       body: body
+     })}
+  end
+
+  defp build_text_result(url, final_url, body, content_type, opts) when is_binary(body) do
+    if opts[:selector] do
+      {:error,
+       Error.invalid_error("Selector filtering is only supported for HTML content", %{
+         error_code: :invalid_input,
+         selector: opts[:selector],
+         content_type: content_type
+       })}
+    else
+      with {:ok, content} <- format_text(body, opts[:format]),
+           {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
+           {final_content, truncated, original_estimated_tokens} <-
+             maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+        {:ok,
+         build_response(
+           url,
+           final_url,
+           final_content,
+           nil,
+           content_type,
+           :text,
+           opts,
+           truncated,
+           filtered,
+           focus_matches,
+           original_estimated_tokens
+         )}
+      end
+    end
+  end
+
+  defp build_text_result(_url, _final_url, body, content_type, _opts) do
+    {:error,
+     Error.adapter_error("Unexpected response body for text fetch", %{
+       error_code: :unavailable,
+       content_type: content_type,
+       body: body
+     })}
+  end
+
+  defp build_pdf_result(url, final_url, body, content_type, opts) when is_binary(body) do
+    cond do
+      opts[:selector] ->
+        {:error,
+         Error.invalid_error("Selector filtering is not supported for PDF content", %{
+           error_code: :invalid_input,
+           selector: opts[:selector],
+           content_type: content_type
+         })}
+
+      opts[:format] == :html ->
+        {:error,
+         Error.invalid_error("HTML output is not supported for PDF content", %{
+           error_code: :invalid_input,
+           format: :html,
+           content_type: content_type
+         })}
+
+      true ->
+        with {:ok, text} <- extract_pdf_text(body),
+             {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts),
+             {final_content, truncated, original_estimated_tokens} <-
+               maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+          {:ok,
+           build_response(
+             url,
+             final_url,
+             final_content,
+             title_from_url(final_url),
+             content_type,
+             :pdf,
+             opts,
+             truncated,
+             filtered,
+             focus_matches,
+             original_estimated_tokens
+           )}
+        end
+    end
+  end
+
+  defp build_pdf_result(_url, _final_url, body, content_type, _opts) do
+    {:error,
+     Error.adapter_error("Unexpected response body for PDF fetch", %{
+       error_code: :unavailable,
+       content_type: content_type,
+       body: body
+     })}
+  end
+
+  defp build_response(
+         url,
+         final_url,
+         content,
+         title,
+         content_type,
+         document_type,
+         opts,
+         truncated,
+         filtered,
+         focus_matches,
+         original_estimated_tokens
+       ) do
+    passages = maybe_build_passages(content, title, final_url, opts[:citations])
+
+    %{
+      url: url,
+      final_url: final_url,
+      title: title,
+      content: content,
+      format: opts[:format],
+      content_type: content_type,
+      document_type: document_type,
+      retrieved_at: retrieved_at(),
+      estimated_tokens: estimate_tokens(content),
+      original_estimated_tokens: original_estimated_tokens,
+      truncated: truncated,
+      filtered: filtered,
+      focus_matches: focus_matches,
+      cached: false,
+      citations: %{enabled: opts[:citations]},
+      passages: passages
+    }
+  end
+
+  defp normalize_opts(opts) do
+    format = opts[:format] || :markdown
+    citations = normalize_citations(opts[:citations])
+    focus_terms = normalize_focus_terms(opts[:focus_terms])
+
+    cond do
+      format not in @supported_formats ->
+        {:error,
+         Error.invalid_error("Unsupported web fetch format", %{
+           error_code: :invalid_input,
+           format: format,
+           supported_formats: @supported_formats
+         })}
+
+      present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) ->
+        {:error,
+         Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{
+           error_code: :invalid_input
+         })}
+
+      format == :html and focus_terms != [] ->
+        {:error,
+         Error.invalid_error("Focused filtering is only supported for markdown and text output", %{
+           error_code: :invalid_input,
+           format: format
+         })}
+
+      true ->
+        normalized =
+          opts
+          |> Keyword.put(:format, format)
+          |> Keyword.put(:citations, citations)
+          |> Keyword.put(:focus_terms, focus_terms)
+          |> Keyword.put_new(:focus_window, 0)
+          |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
+          |> Keyword.put_new(:max_redirects, @default_max_redirects)
+          |> Keyword.put_new(:cache, true)
+          |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
+          |> Keyword.put_new(:known_urls, [])
+
+        {:ok, normalized}
+    end
+  end
+
+  defp validate_url(url, opts) do
+    normalized_url = String.trim(url)
+    max_url_length = opts[:max_url_length] || @default_max_url_length
+
+    cond do
+      normalized_url == "" ->
+        {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})}
+
+      String.length(normalized_url) > max_url_length ->
+        {:error,
+         Error.invalid_error("URL exceeds maximum length", %{
+           error_code: :url_too_long,
+           max_url_length: max_url_length
+         })}
+
+      true ->
+        uri = URI.parse(normalized_url)
+
+        cond do
+          uri.scheme not in ["http", "https"] ->
+            {:error,
+             Error.invalid_error("Web fetch only supports http and https URLs", %{
+               error_code: :invalid_input,
+               scheme: uri.scheme
+             })}
+
+          is_nil(uri.host) or uri.host == "" ->
+            {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})}
+
+          not ascii_only?(uri.host) ->
+            {:error,
+             Error.invalid_error("Web fetch only accepts ASCII hostnames", %{
+               error_code: :url_not_allowed,
+               host: uri.host
+             })}
+
+          true ->
+            {:ok, URI.to_string(uri), normalize_uri(uri)}
+        end
+    end
+  end
+
+  defp validate_known_url(url, opts) do
+    # Provenance guard: the URL is compared with the normalized `:known_urls`
+    # only when `:require_known_url` is set; otherwise every URL passes.
+    enforce? = Keyword.get(opts, :require_known_url, false)
+
+    accepted_urls =
+      opts[:known_urls]
+      |> List.wrap()
+      |> Enum.map(&normalize_known_url/1)
+      |> Enum.reject(&is_nil/1)
+
+    if not enforce? or url in accepted_urls do
+      :ok
+    else
+      {:error,
+       Error.invalid_error("Web fetch URL must already be present in tool context", %{
+         error_code: :url_not_allowed,
+         url: url
+       })}
+    end
+  end
+
+  defp validate_domain_filters(%URI{} = uri, opts) do
+    with {:ok, allowed_rules} <- normalize_domain_rules(opts[:allowed_domains]),
+         {:ok, blocked_rules} <- normalize_domain_rules(opts[:blocked_domains]) do
+      cond do
+        allowed_rules != [] and not Enum.any?(allowed_rules, &rule_matches?(&1, uri)) ->
+          {:error,
+           Error.invalid_error("URL is not permitted by allowed_domains", %{
+             error_code: :url_not_allowed,
+             url: URI.to_string(uri)
+           })}
+
+        blocked_rules != [] and Enum.any?(blocked_rules, &rule_matches?(&1, uri)) ->
+          {:error,
+           Error.invalid_error("URL is blocked by blocked_domains", %{
+             error_code: :url_not_allowed,
+             url: URI.to_string(uri)
+           })}
+
+        true ->
+          :ok
+      end
+    end
+  end
+
+  defp normalize_domain_rules(nil), do: {:ok, []}
+
+  defp normalize_domain_rules(rules) do
+    rules
+    |> List.wrap()
+    |> Enum.reduce_while({:ok, []}, fn rule, {:ok, acc} ->
+      case normalize_domain_rule(rule) do
+        {:ok, normalized} -> {:cont, {:ok, [normalized | acc]}}
+        {:error, reason} -> {:halt, {:error, reason}}
+      end
+    end)
+    |> case do
+      {:ok, normalized} -> {:ok, Enum.reverse(normalized)}
+      error -> error
+    end
+  end
+
+  defp normalize_domain_rule(rule) when is_binary(rule) do
+    normalized = String.trim(rule)
+
+    cond do
+      normalized == "" ->
+        {:error, Error.invalid_error("Domain rules cannot be empty", %{error_code: :invalid_input})}
+
+      String.contains?(normalized, "://") ->
+        {:error,
+         Error.invalid_error("Domain rules must not include URL schemes", %{
+           error_code: :invalid_input,
+           rule: normalized
+         })}
+
+      true ->
+        uri = URI.parse("https://" <> normalized)
+        host = String.downcase(uri.host || "")
+        path = uri.path || "/"
+
+        cond do
+          host == "" ->
+            {:error,
+             Error.invalid_error("Domain rule must include a host", %{error_code: :invalid_input, rule: normalized})}
+
+          not ascii_only?(host) ->
+            {:error,
+             Error.invalid_error("Domain rules must use ASCII hosts", %{
+               error_code: :invalid_input,
+               rule: normalized
+             })}
+
+          true ->
+            {:ok, %{host: host, path: normalize_rule_path(path)}}
+        end
+    end
+  end
+
+  defp normalize_domain_rule(rule) do
+    {:error, Error.invalid_error("Domain rule must be a string", %{error_code: :invalid_input, rule: rule})}
+  end
+
+  defp rule_matches?(%{host: host, path: path}, %URI{host: uri_host} = uri) do
+    uri_host = String.downcase(uri_host || "")
+    request_path = normalize_rule_path(uri.path || "/")
+    # The host matches exactly or as a subdomain. The path must match on a segment
+    # boundary, so that a rule for "/docs" does not also match "/docs-internal".
+    subpath_prefix = String.trim_trailing(path, "/") <> "/"
+    host_matches? = uri_host == host or String.ends_with?(uri_host, "." <> host)
+    host_matches? and (path == "/" or request_path == path or String.starts_with?(request_path, subpath_prefix))
+  end
+
+  defp normalize_final_url(%Req.Request{url: %URI{} = uri}) do
+    normalized = normalize_uri(uri)
+    {:ok, URI.to_string(normalized), normalized}
+  end
+
+  defp validate_http_status(%Req.Response{status: status}, _url) when status in 200..299, do: :ok
+
+  defp validate_http_status(%Req.Response{status: 429}, _url) do
+    {:error, Error.adapter_error("Web fetch rate limited", %{error_code: :too_many_requests, status: 429})}
+  end
+
+  defp validate_http_status(%Req.Response{status: status}, url) do
+    {:error,
+     Error.adapter_error("Web fetch returned an HTTP error", %{
+       error_code: :url_not_accessible,
+       status: status,
+       url: url
+     })}
+  end
+
+  defp parse_document(body) do
+    case Floki.parse_document(body) do
+      {:ok, document} ->
+        {:ok, document}
+
+      {:error, reason} ->
+        {:error, Error.adapter_error("Failed to parse fetched HTML", %{error_code: :unavailable, reason: reason})}
+    end
+  end
+
+  # Narrows fetched HTML to the nodes matching `selector`; a nil or "" selector returns `body` untouched.
+  defp select_html(_document, body, selector) when selector in [nil, ""], do: {:ok, body}
+
+  defp select_html(document, _body, selector) do
+    nodes = Floki.find(document, selector)
+
+    if nodes == [] do
+      {:error,
+       Error.invalid_error("Selector did not match any elements in fetched HTML", %{
+         error_code: :invalid_input,
+         selector: selector
+       })}
+    else
+      {:ok, Floki.raw_html(nodes)}
+    end
+  end
+
+  defp extract_title(document) do
+    title =
+      document
+      |> Floki.find("title")
+      |> Floki.text(sep: " ")
+      |> String.trim()
+      |> blank_to_nil()
+
+    {:ok, title}
+  end
+
+  defp format_html(html, :html, _opts), do: {:ok, html}
+
+  defp format_html(html, :text, _opts) do
+    with {:ok, fragment} <- parse_fragment(html) do
+      {:ok, fragment |> Floki.text(sep: "\n") |> String.trim()}
+    end
+  end
+
+  defp format_html(html, :markdown, _opts) do
+    {:ok, Html2Markdown.convert(html) |> String.trim()}
+  rescue
+    error ->
+      {:error,
+       Error.adapter_error("Failed to convert fetched HTML to markdown", %{error_code: :unavailable, reason: error})}
+  end
+
+  defp format_text(text, :text), do: {:ok, String.trim(text)}
+  defp format_text(text, :markdown), do: {:ok, String.trim(text)}
+
+  defp format_text(_text, :html) do
+    {:error,
+     Error.invalid_error("HTML output is only supported for HTML content", %{
+       error_code: :invalid_input
+     })}
+  end
+
+  defp parse_fragment(html) do
+    case Floki.parse_fragment(html) do
+      {:ok, fragment} ->
+        {:ok, fragment}
+
+      {:error, reason} ->
+        {:error,
+         Error.adapter_error("Failed to parse fetched HTML fragment", %{error_code: :unavailable, reason: reason})}
+    end
+  end
+
+  # Keeps only the sections of `content` that mention a focus term, plus `:focus_window`
+  # neighbouring sections on each side. Returns {:ok, content, filtered?, match_count}.
+  defp maybe_filter_content(content, opts) do
+    # A missing :focus_terms is treated like an empty list: there is nothing to filter on.
+    case opts[:focus_terms] || [] do
+      [] ->
+        {:ok, content, false, 0}
+
+      terms ->
+        indexed_sections = content |> split_sections() |> Enum.with_index()
+        downcased_terms = Enum.map(terms, &String.downcase/1)
+
+        matching_indexes =
+          Enum.flat_map(indexed_sections, fn {section, index} ->
+            lowered = String.downcase(section)
+            if Enum.any?(downcased_terms, &String.contains?(lowered, &1)) do
+              [index]
+            else
+              []
+            end
+          end)
+
+        window = max(opts[:focus_window] || 0, 0)
+        last_index = length(indexed_sections) - 1
+
+        # Ranges are clamped to real section indexes so a huge window cannot balloon memory;
+        # set membership plus one in-order pass avoids a list walk for every kept index.
+        kept_indexes =
+          matching_indexes
+          |> Enum.flat_map(fn index -> max(index - window, 0)..min(index + window, last_index) end)
+          |> MapSet.new()
+
+        filtered_content =
+          indexed_sections
+          |> Enum.filter(fn {section, index} -> section != "" and MapSet.member?(kept_indexes, index) end)
+          |> Enum.map_join("\n\n", fn {section, _index} -> section end)
+          |> String.trim()
+
+        {:ok, filtered_content, true, length(matching_indexes)}
+    end
+  end
+
+  defp maybe_truncate(content, nil), do: {content, false, estimate_tokens(content)}
+
+  defp maybe_truncate(content, max_content_tokens) when is_integer(max_content_tokens) and max_content_tokens > 0 do
+    original_estimated_tokens = estimate_tokens(content)
+
+    if original_estimated_tokens <= max_content_tokens do
+      {content, false, original_estimated_tokens}
+    else
+      char_limit = max_content_tokens * 4
+      truncated = String.slice(content, 0, char_limit) |> String.trim()
+      {truncated, true, original_estimated_tokens}
+    end
+  end
+
+  defp maybe_truncate(content, _other), do: {content, false, estimate_tokens(content)}
+
+  defp maybe_build_passages(_content, _title, _url, false), do: []
+
+  # Splits `content` into at most 50 passages whose start_char/end_char are grapheme offsets
+  # into `content` itself, so they stay correct whatever whitespace separates the sections.
+  defp maybe_build_passages(content, title, url, true) do
+    content
+    |> split_sections()
+    |> Enum.reject(&(&1 == ""))
+    |> Enum.reduce({[], content, 0, 0}, fn section, {passages, rest, consumed, index} ->
+      # Sections are trimmed, in-order slices of `content`; only whitespace precedes the next one.
+      {gap, rest} =
+        case String.split(rest, section, parts: 2) do
+          [skipped, remainder] -> {String.length(skipped), remainder}
+          [_unmatched] -> {0, rest}
+        end
+
+      start_char = consumed + gap
+      end_char = start_char + String.length(section)
+      passage = %{index: index, start_char: start_char, end_char: end_char, text: section, title: title, url: url}
+      {[passage | passages], rest, end_char, index + 1}
+    end)
+    |> elem(0)
+    |> Enum.reverse()
+    |> Enum.take(50)
+  end
+
+  defp split_sections(content) do
+    content
+    |> String.split(~r/\n\s*\n+/, trim: true)
+    |> case do
+      [] -> [String.trim(content)]
+      sections -> Enum.map(sections, &String.trim/1)
+    end
+  end
+
+  defp extract_pdf_text(bytes) do
+    case pdftotext_path() do
+      nil ->
+        {:error,
+         Error.adapter_error("PDF extraction requires pdftotext to be installed", %{
+           error_code: :unsupported_content_type,
+           content_type: "application/pdf"
+         })}
+
+      binary ->
+        with_tmp_files("jido_browser_web_fetch", ".pdf", ".txt", fn pdf_path, txt_path ->
+          File.write!(pdf_path, bytes)
+
+          case System.cmd(binary, ["-layout", "-nopgbrk", pdf_path, txt_path], stderr_to_stdout: true) do
+            {_output, 0} ->
+              case File.read(txt_path) do
+                {:ok, text} ->
+                  {:ok, String.trim(text)}
+
+                {:error, reason} ->
+                  {:error,
+                   Error.adapter_error("Failed to read extracted PDF text", %{error_code: :unavailable, reason: reason})}
+              end
+
+            {output, status} ->
+              {:error,
+               Error.adapter_error("pdftotext failed while extracting PDF", %{
+                 error_code: :unavailable,
+                 status: status,
+                 output: output
+               })}
+          end
+        end)
+    end
+  end
+
+  defp pdftotext_path do
+    config(:pdftotext_path) || System.find_executable("pdftotext")
+  end
+
+  defp fetch_cached(url, opts) do
+    if opts[:cache] do
+      ensure_cache_table!()
+      now = System.system_time(:millisecond)
+
+      case :ets.lookup(@cache_table, cache_key(url, opts)) do
+        [{_key, expires_at, result}] ->
+          if expires_at > now do
+            {:ok, Map.put(result, :cached, true)}
+          else
+            :ets.delete(@cache_table, cache_key(url, opts))
+            :miss
+          end
+
+        [] ->
+          :miss
+      end
+    else
+      :miss
+    end
+  end
+
+  defp maybe_store_cache(url, opts, result) do
+    if opts[:cache] do
+      ensure_cache_table!()
+
+      expires_at = System.system_time(:millisecond) + max(opts[:cache_ttl_ms], 0)
+      :ets.insert(@cache_table, {cache_key(url, opts), expires_at, result})
+    end
+
+    :ok
+  end
+
+  defp ensure_cache_table! do
+    case :ets.whereis(@cache_table) do
+      :undefined ->
+        try do
+          :ets.new(@cache_table, [:named_table, :set, :public, read_concurrency: true, write_concurrency: true])
+        rescue
+          ArgumentError -> @cache_table
+        end
+
+      table ->
+        table
+    end
+  end
+
+  defp cache_key(url, opts) do
+    {:jido_browser_web_fetch, url, opts[:format], opts[:selector], opts[:allowed_domains], opts[:blocked_domains],
+     opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations]}
+  end
+
+  defp request_headers do
+    [
+      {"accept", "text/html,application/xhtml+xml,text/plain,application/pdf;q=0.9,*/*;q=0.1"},
+      {"user-agent", user_agent()}
+    ]
+  end
+
+  defp user_agent do
+    vsn =
+      case Application.spec(:jido_browser, :vsn) do
+        nil -> "dev"
+        value -> List.to_string(value)
+      end
+
+    "jido_browser/#{vsn}"
+  end
+
+  defp response_content_type(response) do
+    response
+    |> Req.Response.get_header("content-type")
+    |> List.first()
+    |> case do
+      nil -> infer_content_type(response.body)
+      content_type -> content_type |> String.split(";") |> hd() |> String.trim() |> String.downcase()
+    end
+  end
+
+  defp infer_content_type(body) when is_binary(body) do
+    if String.starts_with?(body, "%PDF-") do
+      "application/pdf"
+    else
+      "text/plain"
+    end
+  end
+
+  defp infer_content_type(_body), do: "application/octet-stream"
+
+  defp text_content_type?(content_type) do
+    content_type in @text_content_types or String.starts_with?(content_type, "text/")
+  end
+
+  defp retrieved_at do
+    DateTime.utc_now()
+    |> DateTime.truncate(:second)
+    |> DateTime.to_iso8601()
+  end
+
+  defp estimate_tokens(content) when is_binary(content) do
+    div(String.length(content) + 3, 4)
+  end
+
+  defp estimate_tokens(_content), do: 0
+
+  defp normalize_citations(%{enabled: enabled}), do: enabled == true
+  defp normalize_citations(enabled), do: enabled == true
+
+  defp present_domain_rules?(rules), do: rules not in [nil, []]
+
+  defp normalize_focus_terms(nil), do: []
+
+  defp normalize_focus_terms(terms) do
+    terms
+    |> List.wrap()
+    |> Enum.map(fn
+      term when is_binary(term) -> String.trim(term)
+      term -> to_string(term)
+    end)
+    |> Enum.reject(&(&1 == ""))
+    |> Enum.uniq()
+  end
+
+  defp normalize_known_url(url) when is_binary(url) do
+    url
+    |> String.trim()
+    |> case do
+      "" -> nil
+      value -> value
+    end
+  end
+
+  defp normalize_known_url(_), do: nil
+
+  defp normalize_uri(%URI{} = uri) do
+    %{uri | host: String.downcase(uri.host || ""), fragment: nil}
+  end
+
+  defp normalize_rule_path(nil), do: "/"
+  defp normalize_rule_path(""), do: "/"
+  defp normalize_rule_path(path), do: if(String.starts_with?(path, "/"), do: path, else: "/" <> path)
+
+  defp title_from_url(url) do
+    path = URI.parse(url).path || ""
+
+    case path do
+      "" -> nil
+      "/" -> nil
+      value -> value |> Path.basename() |> String.trim("/") |> blank_to_nil()
+    end
+  end
+
+  defp blank_to_nil(nil), do: nil
+  defp blank_to_nil(""), do: nil
+  defp blank_to_nil(value), do: value
+
+  defp ascii_only?(value) when is_binary(value) do
+    String.printable?(value) and String.match?(value, ~r/^[\x00-\x7F]+$/)
+  end
+
+  defp config(key, default \\ nil) do
+    :jido_browser
+    |> Application.get_env(:web_fetch, [])
+    |> Keyword.get(key, default)
+  end
+
+  defp with_tmp_files(prefix, first_suffix, second_suffix, fun) do
+    base = Path.join(System.tmp_dir!(), "#{prefix}_#{System.unique_integer([:positive])}")
+    first = base <> first_suffix
+    second = base <> second_suffix
+
+    try do
+      fun.(first, second)
+    after
+      File.rm(first)
+      File.rm(second)
+    end
+  end
+end
diff --git a/mix.exs b/mix.exs
index b7ce8ca..091d21a 100644
--- a/mix.exs
+++ b/mix.exs
@@ -69,6 +69,7 @@ defmodule Jido.Browser.MixProject do
       {:req, "~> 0.5"},
       {:jason, "~> 1.4"},
       {:uniq, "~> 0.6"},
+      {:floki, "~> 0.38"},
       {:html2markdown, "~> 0.3"},
 
       # Dev/Test
@@ -111,7 +112,8 @@ defmodule Jido.Browser.MixProject do
         Core: [
           Jido.Browser,
           Jido.Browser.Session,
-          Jido.Browser.Plugin
+          Jido.Browser.Plugin,
+          Jido.Browser.WebFetch
         ],
         Adapters: [
           Jido.Browser.Adapter,
@@ -154,7 +156,8 @@ defmodule Jido.Browser.MixProject do
         "Content Extraction": [
           Jido.Browser.Actions.Snapshot,
           Jido.Browser.Actions.Screenshot,
-          Jido.Browser.Actions.ExtractContent
+          Jido.Browser.Actions.ExtractContent,
+          Jido.Browser.Actions.WebFetch
         ],
         Advanced: [
           Jido.Browser.Actions.Evaluate
diff --git a/test/jido_browser/composite_actions_and_installer_test.exs b/test/jido_browser/composite_actions_and_installer_test.exs
index dddf9d5..52b4633 100644
--- a/test/jido_browser/composite_actions_and_installer_test.exs
+++ b/test/jido_browser/composite_actions_and_installer_test.exs
@@ -5,6 +5,7 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do
   alias Jido.Browser.Actions.ReadPage
   alias Jido.Browser.Actions.SearchWeb
   alias Jido.Browser.Actions.SnapshotUrl
+  alias Jido.Browser.Actions.WebFetch
   alias Jido.Browser.Installer
   alias Jido.Browser.Session
 
@@ -176,6 +177,64 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do
     end
   end
 
+  describe "WebFetch.run/2" do
+    test "passes provenance options through to the fetch API" do
+      expect(Jido.Browser, :web_fetch, fn "https://example.com/guide", opts ->
+        assert opts[:require_known_url] == true
+        assert "https://example.com/guide" in opts[:known_urls]
+        assert opts[:allowed_domains] == ["example.com"]
+
+        {:ok,
+         %{
+           url: "https://example.com/guide",
+           final_url: "https://example.com/guide",
+           title: "Guide",
+           content: "Fetched guide content",
+           format: :markdown,
+           content_type: "text/html",
+           document_type: :html,
+           retrieved_at: "2026-03-21T00:00:00Z",
+           estimated_tokens: 5,
+           original_estimated_tokens: 5,
+           truncated: false,
+           filtered: false,
+           focus_matches: 0,
+           cached: false,
+           citations: %{enabled: false},
+           passages: []
+         }}
+      end)
+
+      context = %{skill_state: %{seen_urls: ["https://example.com/guide"], web_fetch_uses: 0}}
+
+      assert {:ok, result} =
+               WebFetch.run(
+                 %{
+                   url: "https://example.com/guide",
+                   require_known_url: true,
+                   allowed_domains: ["example.com"]
+                 },
+                 context
+               )
+
+      assert result.status == "success"
+      assert result.url == "https://example.com/guide"
+    end
+
+    test "returns max_uses_exceeded before calling the fetch API" do
+      context = %{skill_state: %{web_fetch_uses: 2}}
+
+      assert {:error, error} =
+               WebFetch.run(
+                 %{url: "https://example.com/guide", max_uses: 2},
+                 context
+               )
+
+      assert %Jido.Browser.Error.InvalidError{} = error
+      assert error.details.error_code == :max_uses_exceeded
+    end
+  end
+
   describe "Installer" do
     test "target returns a supported platform atom" do
       assert Installer.target() in [
diff --git a/test/jido_browser/plugin_test.exs b/test/jido_browser/plugin_test.exs
index 8636afd..ef16ef9 100644
--- a/test/jido_browser/plugin_test.exs
+++ b/test/jido_browser/plugin_test.exs
@@ -23,9 +23,9 @@ defmodule Jido.Browser.PluginTest do
       assert "automation" in tags
     end
 
-    test "has 37 actions" do
+    test "has 38 actions" do
       actions = Plugin.actions()
-      assert length(actions) == 37
+      assert length(actions) == 38
     end
 
     test "includes all expected action modules" do
@@ -73,13 +73,14 @@ defmodule Jido.Browser.PluginTest do
 
       # Advanced
       assert Jido.Browser.Actions.Evaluate in actions
+      assert Jido.Browser.Actions.WebFetch in actions
     end
   end
 
   describe "signal_routes/1" do
-    test "returns 37 routes" do
+    test "returns 38 routes" do
       routes = Plugin.signal_routes(%{})
-      assert length(routes) == 37
+      assert length(routes) == 38
     end
 
     test "maps browser.navigate to Navigate action" do
@@ -122,6 +123,8 @@ defmodule Jido.Browser.PluginTest do
       assert state.adapter == Jido.Browser.Adapters.AgentBrowser
       assert state.last_url == nil
       assert state.last_title == nil
+      assert state.seen_urls == []
+      assert state.web_fetch_uses == 0
     end
 
     test "accepts headless config override" do
@@ -154,7 +157,7 @@ defmodule Jido.Browser.PluginTest do
     test "returns list of signal patterns" do
       patterns = Plugin.signal_patterns()
       assert is_list(patterns)
-      assert length(patterns) == 37
+      assert length(patterns) == 38
     end
 
     test "all patterns have browser. prefix" do
@@ -173,6 +176,7 @@ defmodule Jido.Browser.PluginTest do
       assert "browser.save_state" in patterns
       assert "browser.tab_list" in patterns
       assert "browser.console" in patterns
+      assert "browser.web_fetch" in patterns
     end
   end
 
@@ -188,6 +192,36 @@ defmodule Jido.Browser.PluginTest do
       assert Plugin.transform_result(:some_action, result, %{}) == result
     end
 
+    test "tracks discovered URLs and fetch usage for web fetch results" do
+      context = %{skill_state: %{seen_urls: ["https://seed.example"], web_fetch_uses: 1}}
+
+      result =
+        Plugin.transform_result(
+          Jido.Browser.Actions.WebFetch,
+          {:ok, %{url: "https://example.com", final_url: "https://example.com/final", status: "success"}},
+          context
+        )
+
+      assert {:ok, _result, state_updates} = result
+
+      assert Enum.sort(state_updates.seen_urls) ==
+               Enum.sort(["https://seed.example", "https://example.com", "https://example.com/final"])
+
+      assert state_updates.web_fetch_uses == 2
+    end
+
+    test "tracks URLs returned by search results" do
+      result =
+        Plugin.transform_result(
+          Jido.Browser.Actions.SearchWeb,
+          {:ok, %{results: [%{url: "https://elixir-lang.org"}]}},
+          %{skill_state: %{}}
+        )
+
+      assert {:ok, _result, state_updates} = result
+      assert state_updates.seen_urls == ["https://elixir-lang.org"]
+    end
+
     test "enhances error results when session available" do
       context = %{
         skill_state: %{
diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs
new file mode 100644
index 0000000..c871bd0
--- /dev/null
+++ b/test/jido_browser/web_fetch_test.exs
@@ -0,0 +1,166 @@
+defmodule Jido.Browser.WebFetchTest do
+  use ExUnit.Case, async: false
+  use Mimic
+
+  alias Jido.Browser.Error
+  alias Jido.Browser.WebFetch
+
+  setup :set_mimic_global
+
+  setup_all do
+    Mimic.copy(Req)
+    :ok
+  end
+
+  setup do
+    WebFetch.clear_cache()
+    :ok
+  end
+
+  describe "web_fetch/2" do
+    test "fetches HTML content with selector extraction and citation passages" do
+      expect(Req, :run, fn opts ->
+        assert opts[:url] == "https://example.com/article"
+
+        request = Req.Request.new(url: "https://example.com/article")
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["text/html; charset=utf-8"]},
+            body: """
+            <html>
+              <head><title>Example Article</title></head>
+              <body>
+                <nav>Ignore me</nav>
+                <main>
+                  <h1>Hello</h1>
+                  <p>Alpha paragraph.</p>
+                  <p>Beta paragraph.</p>
+                </main>
+              </body>
+            </html>
+            """
+          }
+
+        {request, response}
+      end)
+
+      assert {:ok, result} =
+               Jido.Browser.web_fetch(
+                 "https://example.com/article",
+                 selector: "main",
+                 format: :markdown,
+                 citations: true
+               )
+
+      assert result.title == "Example Article"
+      assert result.document_type == :html
+      assert result.format == :markdown
+      assert result.content =~ "Hello"
+      assert result.content =~ "Alpha paragraph."
+      assert result.cached == false
+      assert result.citations.enabled == true
+      assert [%{start_char: 0, text: passage_text} | _] = result.passages
+      assert passage_text =~ "Hello"
+    end
+
+    test "applies focused filtering to plain text responses" do
+      expect(Req, :run, fn opts ->
+        request = Req.Request.new(url: opts[:url])
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["text/plain"]},
+            body: """
+            Intro section
+
+            The relevant paragraph mentions Elixir and OTP.
+
+            Closing section
+            """
+          }
+
+        {request, response}
+      end)
+
+      assert {:ok, result} =
+               Jido.Browser.web_fetch(
+                 "https://example.com/notes.txt",
+                 format: :text,
+                 focus_terms: ["elixir"]
+               )
+
+      assert result.filtered == true
+      assert result.focus_matches == 1
+      assert result.content =~ "relevant paragraph"
+      refute result.content =~ "Intro section"
+    end
+
+    test "rejects URLs outside allowed_domains" do
+      assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
+               Jido.Browser.web_fetch(
+                 "https://example.com/private",
+                 allowed_domains: ["docs.example.com"]
+               )
+    end
+
+    test "enforces known URL provenance when requested" do
+      assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
+               Jido.Browser.web_fetch(
+                 "https://example.com/private",
+                 require_known_url: true,
+                 known_urls: ["https://example.com/public"]
+               )
+    end
+
+    test "caps returned content by approximate token budget" do
+      expect(Req, :run, fn opts ->
+        request = Req.Request.new(url: opts[:url])
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["text/plain"]},
+            body: String.duplicate("abcdef", 20)
+          }
+
+        {request, response}
+      end)
+
+      assert {:ok, result} =
+               Jido.Browser.web_fetch(
+                 "https://example.com/large.txt",
+                 format: :text,
+                 max_content_tokens: 5
+               )
+
+      assert result.truncated == true
+      assert result.original_estimated_tokens > 5
+      assert result.estimated_tokens <= 5
+    end
+
+    test "reuses cached responses for identical requests" do
+      expect(Req, :run, fn opts ->
+        request = Req.Request.new(url: opts[:url])
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["text/plain"]},
+            body: "cached content"
+          }
+
+        {request, response}
+      end)
+
+      assert {:ok, first} = Jido.Browser.web_fetch("https://example.com/cache.txt", format: :text)
+      assert {:ok, second} = Jido.Browser.web_fetch("https://example.com/cache.txt", format: :text)
+
+      assert first.cached == false
+      assert second.cached == true
+      assert first.content == second.content
+    end
+  end
+end

From 48c8632cab7000f0f080538da2553fce86d6ce3f Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:00:41 -0500
Subject: [PATCH 2/7] chore: remove changelog entry from web fetch PR

---
 CHANGELOG.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 99820c3..115277a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,10 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-### Added
-
-- Add HTTP-first `Jido.Browser.web_fetch/2` and `Jido.Browser.Actions.WebFetch` for stateless page retrieval with domain policy, focused filtering, caching, and citation-ready passages
-
 ### Changed
 
 - Rename the public Elixir namespace from `JidoBrowser.*` to `Jido.Browser.*`

From f57bdd25538d4f0e2cbcfbd3db8d22284606d5b0 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:01:51 -0500
Subject: [PATCH 3/7] chore: drop changelog newline diff

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 115277a..fb19c91 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -110,4 +110,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Refactoring:
 
-* streamline agent-browser runtime defaults by mikehostetler
+* streamline agent-browser runtime defaults by mikehostetler
\ No newline at end of file

From 5c674e69e2e543ea0ee2067c695a2e3badbd00ca Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:15:02 -0500
Subject: [PATCH 4/7] feat: use extractous for web fetch documents

---
 README.md                             |   7 +-
 lib/jido_browser/actions/web_fetch.ex |   8 +-
 lib/jido_browser/web_fetch.ex         | 378 +++++++++++++++++++-------
 mix.exs                               |   1 +
 mix.lock                              |   4 +
 test/jido_browser/web_fetch_test.exs  |  97 +++++++
 6 files changed, 387 insertions(+), 108 deletions(-)

diff --git a/README.md b/README.md
index c1d1808..590898a 100644
--- a/README.md
+++ b/README.md
@@ -94,6 +94,8 @@ result.content
 result.passages
 ```
 
+`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats.
+
 ### State Persistence
 
 ```elixir
@@ -164,7 +166,10 @@ Optional web fetch settings:
 ```elixir
 config :jido_browser, :web_fetch,
   cache_ttl_ms: 300_000,
-  pdftotext_path: "/usr/local/bin/pdftotext"
+  extractous: [
+    pdf: [extract_annotation_text: true],
+    office: [include_headers_and_footers: true]
+  ]
 ```
 
 ## Backends
diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex
index 417f07d..7efcaae 100644
--- a/lib/jido_browser/actions/web_fetch.ex
+++ b/lib/jido_browser/actions/web_fetch.ex
@@ -1,17 +1,17 @@
 defmodule Jido.Browser.Actions.WebFetch do
   @moduledoc """
-  Stateless HTTP-first page retrieval for agent workflows.
+  Stateless HTTP-first document retrieval for agent workflows.
 
   `WebFetch` is a lighter-weight alternative to browser navigation when the
   target content can be retrieved over plain HTTP(S) without JavaScript
-  execution.
+  execution, including fetched PDFs and office-style documents.
   """
 
   use Jido.Action,
     name: "web_fetch",
     description:
-      "Fetch a URL over HTTP(S) with domain policy controls, optional focused filtering, " <>
-        "approximate token caps, and citation-ready passages.",
+      "Fetch a URL over HTTP(S) with domain policy controls, Extractous-backed document extraction, " <>
+        "optional focused filtering, approximate token caps, and citation-ready passages.",
     category: "Browser",
     tags: ["browser", "web", "fetch", "http", "retrieval"],
     vsn: "2.0.0",
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
index 8599598..d94b51d 100644
--- a/lib/jido_browser/web_fetch.ex
+++ b/lib/jido_browser/web_fetch.ex
@@ -1,7 +1,8 @@
 defmodule Jido.Browser.WebFetch do
   @moduledoc """
   Stateless HTTP-first web retrieval with optional domain policy, caching,
-  focused filtering, and citation-ready passage metadata.
+  focused filtering, citation-ready passage metadata, and Extractous-backed
+  document extraction.
 
   This module is intended for document retrieval workloads where starting a full
   browser session would be unnecessary or too expensive.
@@ -16,8 +17,64 @@ defmodule Jido.Browser.WebFetch do
   @default_max_url_length 2_048
   @supported_formats [:markdown, :text, :html]
   @html_content_types ["text/html", "application/xhtml+xml"]
-  @text_content_types ["text/plain", "text/markdown", "text/csv", "text/xml", "application/xml"]
-  @pdf_content_types ["application/pdf"]
+  @text_content_types [
+    "text/plain",
+    "text/markdown",
+    "text/csv",
+    "text/xml",
+    "application/xml",
+    "application/json",
+    "application/ld+json"
+  ]
+  @document_content_types %{
+    "application/pdf" => :pdf,
+    "application/msword" => :word_processing,
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => :word_processing,
+    "application/vnd.ms-word.document.macroenabled.12" => :word_processing,
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.template" => :word_processing,
+    "application/vnd.ms-word.template.macroenabled.12" => :word_processing,
+    "application/vnd.ms-excel" => :spreadsheet,
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => :spreadsheet,
+    "application/vnd.ms-excel.sheet.macroenabled.12" => :spreadsheet,
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.template" => :spreadsheet,
+    "application/vnd.ms-excel.template.macroenabled.12" => :spreadsheet,
+    "application/vnd.ms-powerpoint" => :presentation,
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation" => :presentation,
+    "application/vnd.ms-powerpoint.presentation.macroenabled.12" => :presentation,
+    "application/vnd.openxmlformats-officedocument.presentationml.slideshow" => :presentation,
+    "application/vnd.openxmlformats-officedocument.presentationml.template" => :presentation,
+    "application/vnd.oasis.opendocument.text" => :word_processing,
+    "application/vnd.oasis.opendocument.spreadsheet" => :spreadsheet,
+    "application/vnd.oasis.opendocument.presentation" => :presentation,
+    "application/rtf" => :word_processing,
+    "text/rtf" => :word_processing,
+    "application/epub+zip" => :ebook,
+    "message/rfc822" => :email,
+    "application/vnd.ms-outlook" => :email
+  }
+  @document_extensions %{
+    "pdf" => :pdf,
+    "doc" => :word_processing,
+    "docx" => :word_processing,
+    "docm" => :word_processing,
+    "dotx" => :word_processing,
+    "dotm" => :word_processing,
+    "odt" => :word_processing,
+    "rtf" => :word_processing,
+    "xls" => :spreadsheet,
+    "xlsx" => :spreadsheet,
+    "xlsm" => :spreadsheet,
+    "xlsb" => :spreadsheet,
+    "ods" => :spreadsheet,
+    "ppt" => :presentation,
+    "pptx" => :presentation,
+    "pptm" => :presentation,
+    "ppsx" => :presentation,
+    "odp" => :presentation,
+    "epub" => :ebook,
+    "eml" => :email,
+    "msg" => :email
+  }
 
   @type result :: %{
           required(:url) => String.t(),
@@ -35,7 +92,8 @@ defmodule Jido.Browser.WebFetch do
           required(:cached) => boolean(),
           required(:citations) => %{enabled: boolean()},
           required(:passages) => list(map()),
-          optional(:title) => String.t() | nil
+          optional(:title) => String.t() | nil,
+          optional(:metadata) => map()
         }
 
   @doc """
@@ -53,6 +111,7 @@ defmodule Jido.Browser.WebFetch do
   - `:cache` - enable ETS cache, defaults to `true`
   - `:cache_ttl_ms` - cache TTL in milliseconds
   - `:require_known_url` / `:known_urls` - optional URL provenance guard
+  - `:extractous` - optional `ExtractousEx` keyword options merged with config
   """
   @spec fetch(String.t(), keyword()) :: {:ok, result()} | {:error, Exception.t()}
   def fetch(url, opts \\ [])
@@ -125,13 +184,14 @@ defmodule Jido.Browser.WebFetch do
 
   defp build_result(url, final_url, response, opts) do
     content_type = response_content_type(response)
+    document_type = extractable_document_type(content_type, final_url, response.body)
 
     cond do
       content_type in @html_content_types ->
         build_html_result(url, final_url, response.body, content_type, opts)
 
-      content_type in @pdf_content_types ->
-        build_pdf_result(url, final_url, response.body, content_type, opts)
+      not is_nil(document_type) ->
+        build_document_result(url, final_url, response.body, content_type, document_type, opts)
 
       text_content_type?(content_type) ->
         build_text_result(url, final_url, response.body, content_type, opts)
@@ -221,11 +281,11 @@ defmodule Jido.Browser.WebFetch do
      })}
   end
 
-  defp build_pdf_result(url, final_url, body, content_type, opts) when is_binary(body) do
+  defp build_document_result(url, final_url, body, content_type, document_type, opts) when is_binary(body) do
     cond do
       opts[:selector] ->
         {:error,
-         Error.invalid_error("Selector filtering is not supported for PDF content", %{
+         Error.invalid_error("Selector filtering is only supported for HTML content", %{
            error_code: :invalid_input,
            selector: opts[:selector],
            content_type: content_type
@@ -233,14 +293,14 @@ defmodule Jido.Browser.WebFetch do
 
       opts[:format] == :html ->
         {:error,
-         Error.invalid_error("HTML output is not supported for PDF content", %{
+         Error.invalid_error("HTML output is only supported for HTML content", %{
            error_code: :invalid_input,
            format: :html,
            content_type: content_type
          })}
 
       true ->
-        with {:ok, text} <- extract_pdf_text(body),
+        with {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts),
              {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts),
              {final_content, truncated, original_estimated_tokens} <-
                maybe_truncate(filtered_content, opts[:max_content_tokens]) do
@@ -249,22 +309,23 @@ defmodule Jido.Browser.WebFetch do
              url,
              final_url,
              final_content,
-             title_from_url(final_url),
+             document_title(metadata, final_url),
              content_type,
-             :pdf,
+             document_type,
              opts,
              truncated,
              filtered,
              focus_matches,
-             original_estimated_tokens
+             original_estimated_tokens,
+             metadata
            )}
         end
     end
   end
 
-  defp build_pdf_result(_url, _final_url, body, content_type, _opts) do
+  defp build_document_result(_url, _final_url, body, content_type, _document_type, _opts) do
     {:error,
-     Error.adapter_error("Unexpected response body for PDF fetch", %{
+     Error.adapter_error("Unexpected response body for document fetch", %{
        error_code: :unavailable,
        content_type: content_type,
        body: body
@@ -282,7 +343,8 @@ defmodule Jido.Browser.WebFetch do
          truncated,
          filtered,
          focus_matches,
-         original_estimated_tokens
+         original_estimated_tokens,
+         metadata \\ nil
        ) do
     passages = maybe_build_passages(content, title, final_url, opts[:citations])
 
@@ -304,6 +366,7 @@ defmodule Jido.Browser.WebFetch do
       citations: %{enabled: opts[:citations]},
       passages: passages
     }
+    |> maybe_put_metadata(metadata)
   end
 
   defp normalize_opts(opts) do
@@ -311,42 +374,46 @@ defmodule Jido.Browser.WebFetch do
     citations = normalize_citations(opts[:citations])
     focus_terms = normalize_focus_terms(opts[:focus_terms])
 
-    cond do
-      format not in @supported_formats ->
-        {:error,
-         Error.invalid_error("Unsupported web fetch format", %{
-           error_code: :invalid_input,
-           format: format,
-           supported_formats: @supported_formats
-         })}
+    with {:ok, configured_extractous_opts} <- normalize_extractous_opts(config(:extractous, [])),
+         {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])) do
+      cond do
+        format not in @supported_formats ->
+          {:error,
+           Error.invalid_error("Unsupported web fetch format", %{
+             error_code: :invalid_input,
+             format: format,
+             supported_formats: @supported_formats
+           })}
 
-      present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) ->
-        {:error,
-         Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{
-           error_code: :invalid_input
-         })}
+        present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) ->
+          {:error,
+           Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{
+             error_code: :invalid_input
+           })}
 
-      format == :html and focus_terms != [] ->
-        {:error,
-         Error.invalid_error("Focused filtering is only supported for markdown and text output", %{
-           error_code: :invalid_input,
-           format: format
-         })}
+        format == :html and focus_terms != [] ->
+          {:error,
+           Error.invalid_error("Focused filtering is only supported for markdown and text output", %{
+             error_code: :invalid_input,
+             format: format
+           })}
 
-      true ->
-        normalized =
-          opts
-          |> Keyword.put(:format, format)
-          |> Keyword.put(:citations, citations)
-          |> Keyword.put(:focus_terms, focus_terms)
-          |> Keyword.put_new(:focus_window, 0)
-          |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
-          |> Keyword.put_new(:max_redirects, @default_max_redirects)
-          |> Keyword.put_new(:cache, true)
-          |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
-          |> Keyword.put_new(:known_urls, [])
-
-        {:ok, normalized}
+        true ->
+          normalized =
+            opts
+            |> Keyword.put(:format, format)
+            |> Keyword.put(:citations, citations)
+            |> Keyword.put(:focus_terms, focus_terms)
+            |> Keyword.put(:extractous, merge_extractous_opts(configured_extractous_opts, request_extractous_opts))
+            |> Keyword.put_new(:focus_window, 0)
+            |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
+            |> Keyword.put_new(:max_redirects, @default_max_redirects)
+            |> Keyword.put_new(:cache, true)
+            |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
+            |> Keyword.put_new(:known_urls, [])
+
+          {:ok, normalized}
+      end
     end
   end
 
@@ -694,44 +761,44 @@ defmodule Jido.Browser.WebFetch do
     end
   end
 
-  defp extract_pdf_text(bytes) do
-    case pdftotext_path() do
-      nil ->
+  defp extract_document_content(bytes, final_url, content_type, document_type, opts) do
+    case ExtractousEx.extract_from_bytes(bytes, opts[:extractous]) do
+      {:ok, %{content: content, metadata: metadata}} when is_binary(content) ->
+        {:ok, String.trim(content), normalize_metadata(metadata)}
+
+      {:ok, %{content: content}} when is_binary(content) ->
+        {:ok, String.trim(content), %{}}
+
+      {:ok, result} ->
         {:error,
-         Error.adapter_error("PDF extraction requires pdftotext to be installed", %{
-           error_code: :unsupported_content_type,
-           content_type: "application/pdf"
+         Error.adapter_error("ExtractousEx returned an unexpected document payload", %{
+           error_code: :unavailable,
+           url: final_url,
+           content_type: content_type,
+           document_type: document_type,
+           result: result
          })}
 
-      binary ->
-        with_tmp_files("jido_browser_web_fetch", ".pdf", ".txt", fn pdf_path, txt_path ->
-          File.write!(pdf_path, bytes)
-
-          case System.cmd(binary, ["-layout", "-nopgbrk", pdf_path, txt_path], stderr_to_stdout: true) do
-            {_output, 0} ->
-              case File.read(txt_path) do
-                {:ok, text} ->
-                  {:ok, String.trim(text)}
-
-                {:error, reason} ->
-                  {:error,
-                   Error.adapter_error("Failed to read extracted PDF text", %{error_code: :unavailable, reason: reason})}
-              end
-
-            {output, status} ->
-              {:error,
-               Error.adapter_error("pdftotext failed while extracting PDF", %{
-                 error_code: :unavailable,
-                 status: status,
-                 output: output
-               })}
-          end
-        end)
+      {:error, reason} ->
+        {:error,
+         Error.adapter_error("ExtractousEx failed while extracting document content", %{
+           error_code: :unavailable,
+           url: final_url,
+           content_type: content_type,
+           document_type: document_type,
+           reason: reason
+         })}
     end
-  end
-
-  defp pdftotext_path do
-    config(:pdftotext_path) || System.find_executable("pdftotext")
+  rescue
+    error ->
+      {:error,
+       Error.adapter_error("ExtractousEx failed while extracting document content", %{
+         error_code: :unavailable,
+         url: final_url,
+         content_type: content_type,
+         document_type: document_type,
+         reason: error
+       })}
   end
 
   defp fetch_cached(url, opts) do
@@ -783,12 +850,16 @@ defmodule Jido.Browser.WebFetch do
 
   defp cache_key(url, opts) do
     {:jido_browser_web_fetch, url, opts[:format], opts[:selector], opts[:allowed_domains], opts[:blocked_domains],
-     opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations]}
+     opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations], opts[:extractous]}
   end
 
   defp request_headers do
     [
-      {"accept", "text/html,application/xhtml+xml,text/plain,application/pdf;q=0.9,*/*;q=0.1"},
+      {"accept",
+       "text/html,application/xhtml+xml,text/plain,application/json,application/pdf," <>
+         "application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document," <>
+         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet," <>
+         "application/vnd.openxmlformats-officedocument.presentationml.presentation,*/*;q=0.1"},
       {"user-agent", user_agent()}
     ]
   end
@@ -814,10 +885,15 @@ defmodule Jido.Browser.WebFetch do
   end
 
   defp infer_content_type(body) when is_binary(body) do
-    if String.starts_with?(body, "%PDF-") do
-      "application/pdf"
-    else
-      "text/plain"
+    cond do
+      String.starts_with?(body, "%PDF-") ->
+        "application/pdf"
+
+      likely_text?(body) ->
+        "text/plain"
+
+      true ->
+        "application/octet-stream"
     end
   end
 
@@ -857,6 +933,38 @@ defmodule Jido.Browser.WebFetch do
     |> Enum.uniq()
   end
 
+  # Validates the `:extractous` option (taken from app config or from the call's
+  # options) and normalizes it to a keyword list.
+  #
+  # - `nil` means "no options" and becomes `[]`
+  # - a keyword list is returned unchanged
+  # - any other term is rejected with an `:invalid_input` error carrying the value
+  defp normalize_extractous_opts(nil), do: {:ok, []}
+
+  defp normalize_extractous_opts(opts) do
+    # Keyword.keyword?/1 is false for every non-list term, so a single clause
+    # covers both "not a list" and "list, but not a keyword list".
+    if Keyword.keyword?(opts) do
+      {:ok, opts}
+    else
+      {:error,
+       Error.invalid_error("Extractous options must be a keyword list", %{
+         error_code: :invalid_input,
+         extractous: opts
+       })}
+    end
+  end
+
+  # Deep-merges two keyword lists: nested keyword lists are merged recursively,
+  # and for any other conflicting key the value from `right` wins.
+  defp merge_extractous_opts(left, right) do
+    Keyword.merge(left, right, fn _key, base, override ->
+      nested? = Keyword.keyword?(base) and Keyword.keyword?(override)
+
+      if nested?, do: merge_extractous_opts(base, override), else: override
+    end)
+  end
+
   defp normalize_known_url(url) when is_binary(url) do
     url
     |> String.trim()
@@ -876,6 +984,68 @@ defmodule Jido.Browser.WebFetch do
   defp normalize_rule_path(""), do: "/"
   defp normalize_rule_path(path), do: if(String.starts_with?(path, "/"), do: path, else: "/" <> path)
 
+  defp extractable_document_type(content_type, final_url, body) do
+    Map.get(@document_content_types, content_type) ||
+      infer_document_type_from_body(body) ||
+      if(ambiguous_binary_content_type?(content_type), do: infer_document_type_from_url(final_url), else: nil)
+  end
+
+  # Maps the URL path's file extension to a document type; nil when missing or unknown.
+  defp infer_document_type_from_url(url) do
+    # URI.parse/1 returns `path: nil` for URLs without a path. Because the :path key
+    # is always present, a Map.get/3 default never applies, so default with `||`
+    # to keep Path.extname/1 from raising on nil.
+    path = URI.parse(url).path || ""
+
+    case path |> Path.extname() |> String.trim_leading(".") |> String.downcase() do
+      "" -> nil
+      extension -> Map.get(@document_extensions, extension)
+    end
+  end
+
+  # Detects a document type from the body's leading bytes.
+  # Only the PDF signature ("%PDF-") is recognized; anything else, including
+  # non-binary bodies, yields nil.
+  defp infer_document_type_from_body("%PDF-" <> _rest), do: :pdf
+  defp infer_document_type_from_body(_body), do: nil
+
+  # Chooses the title for an extracted document: the metadata title when
+  # metadata_title/1 yields one, otherwise whatever title_from_url/1 derives.
+  defp document_title(metadata, url) do
+    # NOTE(review): blank_to_nil/1 is defined outside this hunk; presumably it maps blank strings to nil.
+    # A non-nil title falls straight through `with` and is returned as-is.
+    with nil <- blank_to_nil(metadata_title(metadata)) do
+      title_from_url(url)
+    end
+  end
+
+  # Tries each known title key in order; returns the first truthy normalized value, or nil when none matches.
+  defp metadata_title(%{} = metadata) do
+    [:title, "title", "dc:title", :"dc:title"]
+    |> Enum.find_value(fn key ->
+      value = Map.get(metadata, key)
+      value |> metadata_value_to_string() |> blank_to_nil()
+    end)
+  end
+
+  defp metadata_title(_metadata), do: nil
+
+  defp metadata_value_to_string(nil), do: nil
+  defp metadata_value_to_string(value) when is_binary(value), do: String.trim(value)
+
+  defp metadata_value_to_string(value) when is_list(value),
+    do: value |> Enum.map_join(" ", &to_string/1) |> String.trim()
+
+  defp metadata_value_to_string(value) when is_atom(value), do: value |> Atom.to_string() |> String.trim()
+  defp metadata_value_to_string(value) when is_number(value), do: value |> to_string() |> String.trim()
+  defp metadata_value_to_string(_value), do: nil
+
+  defp normalize_metadata(metadata) when is_map(metadata), do: metadata
+  defp normalize_metadata(_metadata), do: %{}
+
+  defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response
+  defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata)
+
   defp title_from_url(url) do
     path = URI.parse(url).path || ""
 
@@ -894,22 +1064,24 @@ defmodule Jido.Browser.WebFetch do
     String.printable?(value) and String.match?(value, ~r/^[\x00-\x7F]+$/)
   end
 
-  defp config(key, default \\ nil) do
-    :jido_browser
-    |> Application.get_env(:web_fetch, [])
-    |> Keyword.get(key, default)
+  defp ambiguous_binary_content_type?(content_type) do
+    content_type in [
+      "application/octet-stream",
+      "binary/octet-stream",
+      "application/download",
+      "application/x-download",
+      "application/zip",
+      "application/x-zip-compressed"
+    ]
   end
 
-  defp with_tmp_files(prefix, first_suffix, second_suffix, fun) do
-    base = Path.join(System.tmp_dir!(), "#{prefix}_#{System.unique_integer([:positive])}")
-    first = base <> first_suffix
-    second = base <> second_suffix
+  # Heuristic: treat the body as text when it is valid UTF-8 and contains no NUL byte.
+  defp likely_text?(body) when is_binary(body),
+    do: String.valid?(body) and :binary.match(body, <<0>>) == :nomatch
 
-    try do
-      fun.(first, second)
-    after
-      File.rm(first)
-      File.rm(second)
-    end
+  defp config(key, default) do
+    :jido_browser
+    |> Application.get_env(:web_fetch, [])
+    |> Keyword.get(key, default)
   end
 end
diff --git a/mix.exs b/mix.exs
index 091d21a..9fad805 100644
--- a/mix.exs
+++ b/mix.exs
@@ -71,6 +71,7 @@ defmodule Jido.Browser.MixProject do
       {:uniq, "~> 0.6"},
       {:floki, "~> 0.38"},
       {:html2markdown, "~> 0.3"},
+      {:extractous_ex, "~> 0.2"},
 
       # Dev/Test
       {:credo, "~> 1.7", only: [:dev, :test], runtime: false},
diff --git a/mix.lock b/mix.lock
index aaf36a7..e91947c 100644
--- a/mix.lock
+++ b/mix.lock
@@ -1,6 +1,7 @@
 %{
   "abacus": {:hex, :abacus, "2.1.0", "b6db5c989ba3d9dd8c36d1cb269e2f0058f34768d47c67eb8ce06697ecb36dd4", [:mix], [], "hexpm", "255de08b02884e8383f1eed8aa31df884ce0fb5eb394db81ff888089f2a1bbff"},
   "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"},
+  "castore": {:hex, :castore, "1.0.18", "5e43ef0ec7d31195dfa5a65a86e6131db999d074179d2ba5a8de11fe14570f55", [:mix], [], "hexpm", "f393e4fe6317829b158fb74d86eb681f737d2fe326aa61ccf6293c4104957e34"},
   "certifi": {:hex, :certifi, "2.15.0", "0e6e882fcdaaa0a5a9f2b3db55b1394dba07e8d6d9bcad08318fb604c6839712", [:rebar3], [], "hexpm", "b147ed22ce71d72eafdad94f055165c1c182f61a2ff49df28bcc71d1d5b94a60"},
   "credo": {:hex, :credo, "1.7.17", "f92b6aa5b26301eaa5a35e4d48ebf5aa1e7094ac00ae38f87086c562caf8a22f", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "1eb5645c835f0b6c9b5410f94b5a185057bcf6d62a9c2b476da971cde8749645"},
   "crontab": {:hex, :crontab, "1.2.0", "503611820257939d5d0fd272eb2b454f48a470435a809479ddc2c40bb515495c", [:mix], [{:ecto, "~> 1.0 or ~> 2.0 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "ebd7ef4d831e1b20fa4700f0de0284a04cac4347e813337978e25b4cc5cc2207"},
@@ -12,6 +13,7 @@
   "erlex": {:hex, :erlex, "0.2.8", "cd8116f20f3c0afe376d1e8d1f0ae2452337729f68be016ea544a72f767d9c12", [:mix], [], "hexpm", "9d66ff9fedf69e49dc3fd12831e12a8a37b76f8651dd21cd45fcf5561a8a7590"},
   "ex_doc": {:hex, :ex_doc, "0.40.1", "67542e4b6dde74811cfd580e2c0149b78010fd13001fda7cfeb2b2c2ffb1344d", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"},
   "excoveralls": {:hex, :excoveralls, "0.18.5", "e229d0a65982613332ec30f07940038fe451a2e5b29bce2a5022165f0c9b157e", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "523fe8a15603f86d64852aab2abe8ddbd78e68579c8525ae765facc5eae01562"},
+  "extractous_ex": {:hex, :extractous_ex, "0.2.1", "c9f7fd58b1d3b0d7eda9e219b1ed534a5b25e485884405d3ceee878e67248df2", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:rustler, "~> 0.37", [hex: :rustler, repo: "hexpm", optional: false]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "8c1a3c74105448545a8478c3610fc920b2da418d47eae656853dc3e881adebd0"},
   "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"},
   "finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"},
   "floki": {:hex, :floki, "0.38.0", "62b642386fa3f2f90713f6e231da0fa3256e41ef1089f83b6ceac7a3fd3abf33", [:mix], [], "hexpm", "a5943ee91e93fb2d635b612caf5508e36d37548e84928463ef9dd986f0d1abd9"},
@@ -51,6 +53,8 @@
   "private": {:hex, :private, "0.1.2", "da4add9f36c3818a9f849840ca43016c8ae7f76d7a46c3b2510f42dcc5632932", [:mix], [], "hexpm", "22ee01c3f450cf8d135da61e10ec59dde006238fab1ea039014791fc8f3ff075"},
   "recase": {:hex, :recase, "0.8.1", "ab98cd35857a86fa5ca99036f575241d71d77d9c2ab0c39aacf1c9b61f6f7d1d", [:mix], [], "hexpm", "9fd8d63e7e43bd9ea385b12364e305778b2bbd92537e95c4b2e26fc507d5e4c2"},
   "req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"},
+  "rustler": {:hex, :rustler, "0.37.3", "5f4e6634d43b26f0a69834dd1d3ed4e1710b022a053bf4a670220c9540c92602", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "a6872c6f53dcf00486d1e7f9e046e20e01bf1654bdacc4193016c2e8002b32a2"},
+  "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.4", "700a878312acfac79fb6c572bb8b57f5aae05fe1cf70d34b5974850bbf2c05bf", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "3b33d99b540b15f142ba47944f7a163a25069f6d608783c321029bc1ffb09514"},
   "splode": {:hex, :splode, "0.3.0", "ff8effecc509a51245df2f864ec78d849248647c37a75886033e3b1a53ca9470", [:mix], [], "hexpm", "73cfd0892d7316d6f2c93e6e8784bd6e137b2aa38443de52fd0a25171d106d81"},
   "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
   "telemetry": {:hex, :telemetry, "1.4.1", "ab6de178e2b29b58e8256b92b382ea3f590a47152ca3651ea857a6cae05ac423", [:rebar3], [], "hexpm", "2172e05a27531d3d31dd9782841065c50dd5c3c7699d95266b2edd54c2dafa1c"},
diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs
index c871bd0..43aad25 100644
--- a/test/jido_browser/web_fetch_test.exs
+++ b/test/jido_browser/web_fetch_test.exs
@@ -9,6 +9,7 @@ defmodule Jido.Browser.WebFetchTest do
 
   setup_all do
     Mimic.copy(Req)
+    Mimic.copy(ExtractousEx)
     :ok
   end
 
@@ -98,6 +99,102 @@ defmodule Jido.Browser.WebFetchTest do
       refute result.content =~ "Intro section"
     end
 
+    test "extracts PDF content through ExtractousEx and preserves metadata" do
+      pdf_bytes = "%PDF-1.7 fake"
+
+      expect(Req, :run, fn opts ->
+        request = Req.Request.new(url: opts[:url])
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["application/pdf"]},
+            body: pdf_bytes
+          }
+
+        {request, response}
+      end)
+
+      expect(ExtractousEx, :extract_from_bytes, fn ^pdf_bytes, opts ->
+        assert opts == []
+
+        {:ok,
+         %{
+           content: "Extracted PDF body",
+           metadata: %{"title" => "Quarterly Report", "author" => "Ops"}
+         }}
+      end)
+
+      assert {:ok, result} =
+               Jido.Browser.web_fetch(
+                 "https://example.com/reports/q1.pdf",
+                 format: :text,
+                 citations: true
+               )
+
+      assert result.title == "Quarterly Report"
+      assert result.document_type == :pdf
+      assert result.content_type == "application/pdf"
+      assert result.content == "Extracted PDF body"
+      assert result.metadata == %{"title" => "Quarterly Report", "author" => "Ops"}
+      assert result.citations.enabled == true
+      assert [%{text: "Extracted PDF body"}] = result.passages
+    end
+
+    test "extracts office documents served as octet-stream based on file extension" do
+      docx_bytes = <<80, 75, 3, 4, 20, 0, 0, 0>>
+
+      expect(Req, :run, fn opts ->
+        request = Req.Request.new(url: opts[:url])
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["application/octet-stream"]},
+            body: docx_bytes
+          }
+
+        {request, response}
+      end)
+
+      expect(ExtractousEx, :extract_from_bytes, fn ^docx_bytes, opts ->
+        assert opts == []
+        {:ok, %{content: "DOCX body", metadata: %{}}}
+      end)
+
+      assert {:ok, result} =
+               Jido.Browser.web_fetch("https://example.com/specs/design.docx", format: :markdown)
+
+      assert result.title == "design.docx"
+      assert result.document_type == :word_processing
+      assert result.content_type == "application/octet-stream"
+      assert result.content == "DOCX body"
+    end
+
+    test "returns an adapter error when ExtractousEx extraction fails" do
+      pdf_bytes = "%PDF-1.7 broken"
+
+      expect(Req, :run, fn opts ->
+        request = Req.Request.new(url: opts[:url])
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["application/pdf"]},
+            body: pdf_bytes
+          }
+
+        {request, response}
+      end)
+
+      expect(ExtractousEx, :extract_from_bytes, fn ^pdf_bytes, [] ->
+        {:error, "parse failed"}
+      end)
+
+      assert {:error, %Error.AdapterError{details: %{error_code: :unavailable, document_type: :pdf}}} =
+               Jido.Browser.web_fetch("https://example.com/broken.pdf", format: :text)
+    end
+
     test "rejects URLs outside allowed_domains" do
       assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
                Jido.Browser.web_fetch(

From 909aad680a0e5b0d63f159a0648ed2770f5b5de3 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:17:46 -0500
Subject: [PATCH 5/7] refactor: harden web fetch normalization

---
 README.md                            |   5 +-
 lib/jido_browser/web_fetch.ex        | 242 +++++++++++++++++----------
 test/jido_browser/web_fetch_test.exs |  30 ++++
 3 files changed, 185 insertions(+), 92 deletions(-)

diff --git a/README.md b/README.md
index 590898a..f6f78bd 100644
--- a/README.md
+++ b/README.md
@@ -92,9 +92,10 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow:
 
 result.content
 result.passages
+result.metadata
 ```
 
-`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats.
+`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. Binary document responses may also include `result.metadata` when extraction returns document metadata.
 
 ### State Persistence
 
@@ -172,6 +173,8 @@ config :jido_browser, :web_fetch,
   ]
 ```
 
+Configured `extractous` options are merged with any per-call `extractous:` keyword options passed to `Jido.Browser.web_fetch/2`.
+
 ## Backends
 
 ### AgentBrowser (Default)
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
index d94b51d..5c5cf81 100644
--- a/lib/jido_browser/web_fetch.ex
+++ b/lib/jido_browser/web_fetch.ex
@@ -153,6 +153,7 @@ defmodule Jido.Browser.WebFetch do
       url: url,
       headers: request_headers(),
       receive_timeout: opts[:timeout],
+      decode_body: false,
       redirect: true,
       max_redirects: opts[:max_redirects]
     ]
@@ -211,24 +212,8 @@ defmodule Jido.Browser.WebFetch do
     with {:ok, document} <- parse_document(body),
          {:ok, html} <- select_html(document, body, selector),
          {:ok, title} <- extract_title(document),
-         {:ok, content} <- format_html(html, opts[:format], opts),
-         {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
-         {final_content, truncated, original_estimated_tokens} <-
-           maybe_truncate(filtered_content, opts[:max_content_tokens]) do
-      {:ok,
-       build_response(
-         url,
-         final_url,
-         final_content,
-         title,
-         content_type,
-         :html,
-         opts,
-         truncated,
-         filtered,
-         focus_matches,
-         original_estimated_tokens
-       )}
+         {:ok, content} <- format_html(html, opts[:format], opts) do
+      finalize_result(url, final_url, content, title, content_type, :html, opts)
     end
   end
 
@@ -242,33 +227,9 @@ defmodule Jido.Browser.WebFetch do
   end
 
   defp build_text_result(url, final_url, body, content_type, opts) when is_binary(body) do
-    if opts[:selector] do
-      {:error,
-       Error.invalid_error("Selector filtering is only supported for HTML content", %{
-         error_code: :invalid_input,
-         selector: opts[:selector],
-         content_type: content_type
-       })}
-    else
-      with {:ok, content} <- format_text(body, opts[:format]),
-           {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
-           {final_content, truncated, original_estimated_tokens} <-
-             maybe_truncate(filtered_content, opts[:max_content_tokens]) do
-        {:ok,
-         build_response(
-           url,
-           final_url,
-           final_content,
-           nil,
-           content_type,
-           :text,
-           opts,
-           truncated,
-           filtered,
-           focus_matches,
-           original_estimated_tokens
-         )}
-      end
+    with :ok <- validate_non_html_options(content_type, opts),
+         {:ok, content} <- format_text(body, opts[:format]) do
+      finalize_result(url, final_url, content, nil, content_type, :text, opts)
     end
   end
 
@@ -282,44 +243,18 @@ defmodule Jido.Browser.WebFetch do
   end
 
   defp build_document_result(url, final_url, body, content_type, document_type, opts) when is_binary(body) do
-    cond do
-      opts[:selector] ->
-        {:error,
-         Error.invalid_error("Selector filtering is only supported for HTML content", %{
-           error_code: :invalid_input,
-           selector: opts[:selector],
-           content_type: content_type
-         })}
-
-      opts[:format] == :html ->
-        {:error,
-         Error.invalid_error("HTML output is only supported for HTML content", %{
-           error_code: :invalid_input,
-           format: :html,
-           content_type: content_type
-         })}
-
-      true ->
-        with {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts),
-             {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts),
-             {final_content, truncated, original_estimated_tokens} <-
-               maybe_truncate(filtered_content, opts[:max_content_tokens]) do
-          {:ok,
-           build_response(
-             url,
-             final_url,
-             final_content,
-             document_title(metadata, final_url),
-             content_type,
-             document_type,
-             opts,
-             truncated,
-             filtered,
-             focus_matches,
-             original_estimated_tokens,
-             metadata
-           )}
-        end
+    with :ok <- validate_non_html_options(content_type, opts),
+         {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts) do
+      finalize_result(
+        url,
+        final_url,
+        text,
+        document_title(metadata, final_url),
+        content_type,
+        document_type,
+        opts,
+        metadata
+      )
     end
   end
 
@@ -344,7 +279,7 @@ defmodule Jido.Browser.WebFetch do
          filtered,
          focus_matches,
          original_estimated_tokens,
-         metadata \\ nil
+         metadata
        ) do
     passages = maybe_build_passages(content, title, final_url, opts[:citations])
 
@@ -369,13 +304,76 @@ defmodule Jido.Browser.WebFetch do
     |> maybe_put_metadata(metadata)
   end
 
+  defp finalize_result(url, final_url, content, title, content_type, document_type, opts, metadata \\ nil) do
+    with {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
+         {final_content, truncated, original_estimated_tokens} <-
+           maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+      {:ok,
+       build_response(
+         url,
+         final_url,
+         final_content,
+         title,
+         content_type,
+         document_type,
+         opts,
+         truncated,
+         filtered,
+         focus_matches,
+         original_estimated_tokens,
+         metadata
+       )}
+    end
+  end
+
+  defp validate_non_html_options(content_type, opts) do
+    cond do
+      opts[:selector] ->
+        {:error,
+         Error.invalid_error("Selector filtering is only supported for HTML content", %{
+           error_code: :invalid_input,
+           selector: opts[:selector],
+           content_type: content_type
+         })}
+
+      opts[:format] == :html ->
+        {:error,
+         Error.invalid_error("HTML output is only supported for HTML content", %{
+           error_code: :invalid_input,
+           format: :html,
+           content_type: content_type
+         })}
+
+      true ->
+        :ok
+    end
+  end
+
   defp normalize_opts(opts) do
     format = opts[:format] || :markdown
     citations = normalize_citations(opts[:citations])
     focus_terms = normalize_focus_terms(opts[:focus_terms])
 
     with {:ok, configured_extractous_opts} <- normalize_extractous_opts(config(:extractous, [])),
-         {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])) do
+         {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])),
+         {:ok, selector} <- normalize_selector(opts[:selector]),
+         {:ok, focus_window} <- normalize_integer_opt(:focus_window, Keyword.get(opts, :focus_window, 0), min: 0),
+         {:ok, timeout} <-
+           normalize_integer_opt(:timeout, Keyword.get(opts, :timeout, config(:timeout, @default_timeout)), min: 1),
+         {:ok, max_redirects} <-
+           normalize_integer_opt(:max_redirects, Keyword.get(opts, :max_redirects, @default_max_redirects), min: 0),
+         {:ok, cache_ttl_ms} <-
+           normalize_integer_opt(
+             :cache_ttl_ms,
+             Keyword.get(opts, :cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)),
+             min: 0
+           ),
+         {:ok, max_content_tokens} <-
+           normalize_optional_integer_opt(:max_content_tokens, opts[:max_content_tokens], min: 1),
+         {:ok, max_url_length} <- normalize_optional_integer_opt(:max_url_length, opts[:max_url_length], min: 1),
+         {:ok, cache} <- normalize_boolean_opt(:cache, Keyword.get(opts, :cache, true)),
+         {:ok, require_known_url} <-
+           normalize_boolean_opt(:require_known_url, Keyword.get(opts, :require_known_url, false)) do
       cond do
         format not in @supported_formats ->
           {:error,
@@ -402,14 +400,18 @@ defmodule Jido.Browser.WebFetch do
           normalized =
             opts
             |> Keyword.put(:format, format)
+            |> Keyword.put(:selector, selector)
             |> Keyword.put(:citations, citations)
             |> Keyword.put(:focus_terms, focus_terms)
+            |> Keyword.put(:focus_window, focus_window)
+            |> Keyword.put(:timeout, timeout)
+            |> Keyword.put(:max_redirects, max_redirects)
+            |> Keyword.put(:cache, cache)
+            |> Keyword.put(:cache_ttl_ms, cache_ttl_ms)
+            |> Keyword.put(:require_known_url, require_known_url)
             |> Keyword.put(:extractous, merge_extractous_opts(configured_extractous_opts, request_extractous_opts))
-            |> Keyword.put_new(:focus_window, 0)
-            |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
-            |> Keyword.put_new(:max_redirects, @default_max_redirects)
-            |> Keyword.put_new(:cache, true)
-            |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
+            |> maybe_put(:max_content_tokens, max_content_tokens)
+            |> maybe_put(:max_url_length, max_url_length)
             |> Keyword.put_new(:known_urls, [])
 
           {:ok, normalized}
@@ -937,7 +939,7 @@ defmodule Jido.Browser.WebFetch do
 
   defp normalize_extractous_opts(opts) when is_list(opts) do
     if Keyword.keyword?(opts) do
-      {:ok, opts}
+      {:ok, canonicalize_keyword_list(opts)}
     else
       {:error,
        Error.invalid_error("Extractous options must be a keyword list", %{
@@ -955,6 +957,62 @@ defmodule Jido.Browser.WebFetch do
      })}
   end
 
+  defp normalize_selector(nil), do: {:ok, nil}
+
+  defp normalize_selector(selector) when is_binary(selector) do
+    selector
+    |> String.trim()
+    |> case do
+      "" -> {:ok, nil}
+      value -> {:ok, value}
+    end
+  end
+
+  defp normalize_selector(selector) do
+    {:error,
+     Error.invalid_error("Selector must be a string", %{
+       error_code: :invalid_input,
+       selector: selector
+     })}
+  end
+
+  defp normalize_integer_opt(_name, value, min: min) when is_integer(value) and value >= min, do: {:ok, value}
+
+  defp normalize_integer_opt(name, value, min: min) do
+    {:error,
+     Error.invalid_error("#{name} must be an integer greater than or equal to #{min}", %{
+       error_code: :invalid_input,
+       option: name,
+       value: value
+     })}
+  end
+
+  defp normalize_optional_integer_opt(_name, nil, _opts), do: {:ok, nil}
+  defp normalize_optional_integer_opt(name, value, opts), do: normalize_integer_opt(name, value, opts)
+
+  defp normalize_boolean_opt(_name, value) when is_boolean(value), do: {:ok, value}
+
+  defp normalize_boolean_opt(name, value) do
+    {:error,
+     Error.invalid_error("#{name} must be a boolean", %{
+       error_code: :invalid_input,
+       option: name,
+       value: value
+     })}
+  end
+
+  defp canonicalize_keyword_list(keyword_list) do
+    keyword_list
+    |> Enum.map(fn {key, value} = pair ->
+      if is_list(value) and Keyword.keyword?(value) do
+        {key, canonicalize_keyword_list(value)}
+      else
+        pair
+      end
+    end)
+    |> Enum.sort_by(fn {key, _value} -> to_string(key) end)
+  end
+
   defp merge_extractous_opts(left, right) do
     Keyword.merge(left, right, fn _key, left_value, right_value ->
       if Keyword.keyword?(left_value) and Keyword.keyword?(right_value) do
@@ -1045,6 +1103,8 @@ defmodule Jido.Browser.WebFetch do
 
   defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response
   defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata)
+  defp maybe_put(opts, _key, nil), do: opts
+  defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value)
 
   defp title_from_url(url) do
     path = URI.parse(url).path || ""
diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs
index 43aad25..931ada8 100644
--- a/test/jido_browser/web_fetch_test.exs
+++ b/test/jido_browser/web_fetch_test.exs
@@ -22,6 +22,7 @@ defmodule Jido.Browser.WebFetchTest do
     test "fetches HTML content with selector extraction and citation passages" do
       expect(Req, :run, fn opts ->
         assert opts[:url] == "https://example.com/article"
+        assert opts[:decode_body] == false
 
         request = Req.Request.new(url: "https://example.com/article")
 
@@ -66,6 +67,27 @@ defmodule Jido.Browser.WebFetchTest do
       assert passage_text =~ "Hello"
     end
 
+    test "preserves JSON responses as text content" do
+      expect(Req, :run, fn opts ->
+        assert opts[:decode_body] == false
+        request = Req.Request.new(url: opts[:url])
+
+        response =
+          %Req.Response{
+            status: 200,
+            headers: %{"content-type" => ["application/json"]},
+            body: ~s({"name":"jido","kind":"agent"})
+          }
+
+        {request, response}
+      end)
+
+      assert {:ok, result} = Jido.Browser.web_fetch("https://example.com/data.json", format: :text)
+
+      assert result.document_type == :text
+      assert result.content =~ ~s("name":"jido")
+    end
+
     test "applies focused filtering to plain text responses" do
       expect(Req, :run, fn opts ->
         request = Req.Request.new(url: opts[:url])
@@ -203,6 +225,14 @@ defmodule Jido.Browser.WebFetchTest do
                )
     end
 
+    test "rejects invalid direct API options early" do
+      assert {:error, %Error.InvalidError{details: %{option: :timeout, error_code: :invalid_input}}} =
+               Jido.Browser.web_fetch("https://example.com/notes.txt", timeout: 0)
+
+      assert {:error, %Error.InvalidError{details: %{extractous: [:bad, :shape], error_code: :invalid_input}}} =
+               Jido.Browser.web_fetch("https://example.com/notes.txt", extractous: [:bad, :shape])
+    end
+
     test "enforces known URL provenance when requested" do
       assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
                Jido.Browser.web_fetch(

From 002dba516f34b3ce82ad6d4c00d6624a7deba901 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:25:08 -0500
Subject: [PATCH 6/7] docs: clarify web fetch API

---
 README.md           | 2 +-
 lib/jido_browser.ex | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f6f78bd..81940c0 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow:
 
 result.content
 result.passages
-result.metadata
+result.metadata # present when extraction returns document metadata
 ```
 
 `web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. Binary document responses may also include `result.metadata` when extraction returns document metadata.
diff --git a/lib/jido_browser.ex b/lib/jido_browser.ex
index 91139ac..39a4424 100644
--- a/lib/jido_browser.ex
+++ b/lib/jido_browser.ex
@@ -109,7 +109,13 @@ defmodule Jido.Browser do
     end
   end
 
-  @doc "Fetches a URL over HTTP(S) without starting a browser session."
+  @doc """
+  Fetches a URL over HTTP(S) without starting a browser session.
+
+  HTML responses keep native selector extraction and format conversion, while
+  fetched binary documents such as PDFs and office files are extracted through
+  `ExtractousEx`.
+  """
   @spec web_fetch(String.t(), keyword()) :: {:ok, map()} | {:error, term()}
   def web_fetch(url, opts \\ [])
 

From 1d495fd9436b87c7f75d87bf98131dd3a5ce621d Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:39:02 -0500
Subject: [PATCH 7/7] fix: resolve lint regressions

---
 lib/jido_browser/actions/web_fetch.ex |   5 +-
 lib/jido_browser/plugin.ex            |  17 +-
 lib/jido_browser/web_fetch.ex         | 277 +++++++++++++-------------
 3 files changed, 142 insertions(+), 157 deletions(-)

diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex
index 7efcaae..fa79361 100644
--- a/lib/jido_browser/actions/web_fetch.ex
+++ b/lib/jido_browser/actions/web_fetch.ex
@@ -40,11 +40,8 @@ defmodule Jido.Browser.Actions.WebFetch do
          {:ok, result} <- Jido.Browser.web_fetch(params.url, build_opts(params, context)) do
       {:ok, Map.put(result, :status, "success")}
     else
-      {:error, %_{} = error} ->
+      {:error, error} ->
         {:error, error}
-
-      {:error, reason} ->
-        {:error, Error.adapter_error("Web fetch failed", %{reason: reason})}
     end
   end
 
diff --git a/lib/jido_browser/plugin.ex b/lib/jido_browser/plugin.ex
index 2a58bba..a99f841 100644
--- a/lib/jido_browser/plugin.ex
+++ b/lib/jido_browser/plugin.ex
@@ -285,7 +285,7 @@ defmodule Jido.Browser.Plugin do
     seen_urls =
       current_seen_urls
       |> Kernel.++(extract_urls(result))
-      |> Enum.reject(&is_nil_or_empty/1)
+      |> Enum.reject(&nil_or_empty?/1)
       |> Enum.uniq()
 
     if seen_urls == [] or seen_urls == current_seen_urls do
@@ -305,26 +305,23 @@ defmodule Jido.Browser.Plugin do
   defp extract_urls(result) do
     direct_urls =
       [Map.get(result, :url), Map.get(result, "url"), Map.get(result, :final_url), Map.get(result, "final_url")]
-      |> Enum.reject(&is_nil_or_empty/1)
+      |> Enum.reject(&nil_or_empty?/1)
 
     search_urls =
       result
       |> Map.get(:results, Map.get(result, "results", []))
       |> List.wrap()
       |> Enum.map(fn item ->
-        cond do
-          is_map(item) -> Map.get(item, :url) || Map.get(item, "url")
-          true -> nil
-        end
+        if is_map(item), do: Map.get(item, :url) || Map.get(item, "url")
       end)
-      |> Enum.reject(&is_nil_or_empty/1)
+      |> Enum.reject(&nil_or_empty?/1)
 
     direct_urls ++ search_urls
   end
 
-  defp is_nil_or_empty(nil), do: true
-  defp is_nil_or_empty(""), do: true
-  defp is_nil_or_empty(_value), do: false
+  defp nil_or_empty?(nil), do: true
+  defp nil_or_empty?(""), do: true
+  defp nil_or_empty?(_value), do: false
 
   def signal_patterns do
     [
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
index 5c5cf81..1eae9aa 100644
--- a/lib/jido_browser/web_fetch.ex
+++ b/lib/jido_browser/web_fetch.ex
@@ -177,9 +177,6 @@ defmodule Jido.Browser.WebFetch do
 
       {_request, %_{} = exception} ->
         {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: exception})}
-
-      {_request, reason} ->
-        {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: reason})}
     end
   end
 
@@ -267,62 +264,49 @@ defmodule Jido.Browser.WebFetch do
      })}
   end
 
-  defp build_response(
-         url,
-         final_url,
-         content,
-         title,
-         content_type,
-         document_type,
-         opts,
-         truncated,
-         filtered,
-         focus_matches,
-         original_estimated_tokens,
-         metadata
-       ) do
-    passages = maybe_build_passages(content, title, final_url, opts[:citations])
+  defp build_response(opts, attrs) do
+    passages = maybe_build_passages(attrs.content, attrs.title, attrs.final_url, opts[:citations])
 
     %{
-      url: url,
-      final_url: final_url,
-      title: title,
-      content: content,
+      url: attrs.url,
+      final_url: attrs.final_url,
+      title: attrs.title,
+      content: attrs.content,
       format: opts[:format],
-      content_type: content_type,
-      document_type: document_type,
+      content_type: attrs.content_type,
+      document_type: attrs.document_type,
       retrieved_at: retrieved_at(),
-      estimated_tokens: estimate_tokens(content),
-      original_estimated_tokens: original_estimated_tokens,
-      truncated: truncated,
-      filtered: filtered,
-      focus_matches: focus_matches,
+      estimated_tokens: estimate_tokens(attrs.content),
+      original_estimated_tokens: attrs.original_estimated_tokens,
+      truncated: attrs.truncated,
+      filtered: attrs.filtered,
+      focus_matches: attrs.focus_matches,
       cached: false,
       citations: %{enabled: opts[:citations]},
       passages: passages
     }
-    |> maybe_put_metadata(metadata)
+    |> maybe_put_metadata(attrs.metadata)
   end
 
   defp finalize_result(url, final_url, content, title, content_type, document_type, opts, metadata \\ nil) do
     with {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
          {final_content, truncated, original_estimated_tokens} <-
            maybe_truncate(filtered_content, opts[:max_content_tokens]) do
-      {:ok,
-       build_response(
-         url,
-         final_url,
-         final_content,
-         title,
-         content_type,
-         document_type,
-         opts,
-         truncated,
-         filtered,
-         focus_matches,
-         original_estimated_tokens,
-         metadata
-       )}
+      attrs = %{
+        url: url,
+        final_url: final_url,
+        content: final_content,
+        title: title,
+        content_type: content_type,
+        document_type: document_type,
+        truncated: truncated,
+        filtered: filtered,
+        focus_matches: focus_matches,
+        original_estimated_tokens: original_estimated_tokens,
+        metadata: metadata
+      }
+
+      {:ok, build_response(opts, attrs)}
     end
   end
 
@@ -423,41 +407,10 @@ defmodule Jido.Browser.WebFetch do
     normalized_url = String.trim(url)
     max_url_length = opts[:max_url_length] || @default_max_url_length
 
-    cond do
-      normalized_url == "" ->
-        {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})}
-
-      String.length(normalized_url) > max_url_length ->
-        {:error,
-         Error.invalid_error("URL exceeds maximum length", %{
-           error_code: :url_too_long,
-           max_url_length: max_url_length
-         })}
-
-      true ->
-        uri = URI.parse(normalized_url)
-
-        cond do
-          uri.scheme not in ["http", "https"] ->
-            {:error,
-             Error.invalid_error("Web fetch only supports http and https URLs", %{
-               error_code: :invalid_input,
-               scheme: uri.scheme
-             })}
-
-          is_nil(uri.host) or uri.host == "" ->
-            {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})}
-
-          not ascii_only?(uri.host) ->
-            {:error,
-             Error.invalid_error("Web fetch only accepts ASCII hostnames", %{
-               error_code: :url_not_allowed,
-               host: uri.host
-             })}
-
-          true ->
-            {:ok, URI.to_string(uri), normalize_uri(uri)}
-        end
+    with :ok <- validate_url_length(normalized_url, max_url_length),
+         {:ok, uri} <- parse_fetch_uri(normalized_url),
+         :ok <- validate_uri_host(uri) do
+      {:ok, URI.to_string(uri), normalize_uri(uri)}
     end
   end
 
@@ -468,9 +421,7 @@ defmodule Jido.Browser.WebFetch do
       |> Enum.map(&normalize_known_url/1)
       |> Enum.reject(&is_nil/1)
 
-    if not Keyword.get(opts, :require_known_url, false) do
-      :ok
-    else
+    if Keyword.get(opts, :require_known_url, false) do
       if url in known_urls do
         :ok
       else
@@ -480,6 +431,8 @@ defmodule Jido.Browser.WebFetch do
            url: url
          })}
       end
+    else
+      :ok
     end
   end
 
@@ -566,7 +519,7 @@ defmodule Jido.Browser.WebFetch do
   end
 
   defp rule_matches?(%{host: host, path: path}, %URI{host: uri_host} = uri) do
-    uri_host = String.downcase(uri_host || "")
+    uri_host = String.downcase(uri_host)
     request_path = normalize_rule_path(uri.path || "/")
 
     host_matches? = uri_host == host or String.ends_with?(uri_host, "." <> host)
@@ -677,36 +630,10 @@ defmodule Jido.Browser.WebFetch do
 
       terms ->
         sections = split_sections(content)
-        downcased_terms = Enum.map(terms, &String.downcase/1)
-
-        matching_indexes =
-          sections
-          |> Enum.with_index()
-          |> Enum.flat_map(fn {section, index} ->
-            lowered = String.downcase(section)
-
-            if Enum.any?(downcased_terms, &String.contains?(lowered, &1)) do
-              [index]
-            else
-              []
-            end
-          end)
-
+        matching_indexes = matching_section_indexes(sections, terms)
         window = max(opts[:focus_window] || 0, 0)
-
-        kept_indexes =
-          matching_indexes
-          |> Enum.flat_map(fn index -> (index - window)..(index + window) end)
-          |> Enum.filter(&(&1 >= 0 and &1 < length(sections)))
-          |> Enum.uniq()
-          |> Enum.sort()
-
-        filtered_content =
-          kept_indexes
-          |> Enum.map(&Enum.at(sections, &1))
-          |> Enum.reject(&(&1 == ""))
-          |> Enum.join("\n\n")
-          |> String.trim()
+        kept_indexes = expand_focus_window(matching_indexes, window, length(sections))
+        filtered_content = render_section_slice(sections, kept_indexes)
 
         {:ok, filtered_content, true, length(matching_indexes)}
     end
@@ -768,19 +695,6 @@ defmodule Jido.Browser.WebFetch do
       {:ok, %{content: content, metadata: metadata}} when is_binary(content) ->
         {:ok, String.trim(content), normalize_metadata(metadata)}
 
-      {:ok, %{content: content}} when is_binary(content) ->
-        {:ok, String.trim(content), %{}}
-
-      {:ok, result} ->
-        {:error,
-         Error.adapter_error("ExtractousEx returned an unexpected document payload", %{
-           error_code: :unavailable,
-           url: final_url,
-           content_type: content_type,
-           document_type: document_type,
-           result: result
-         })}
-
       {:error, reason} ->
         {:error,
          Error.adapter_error("ExtractousEx failed while extracting document content", %{
@@ -806,25 +720,28 @@ defmodule Jido.Browser.WebFetch do
   defp fetch_cached(url, opts) do
     if opts[:cache] do
       ensure_cache_table!()
-      now = System.system_time(:millisecond)
-
-      case :ets.lookup(@cache_table, cache_key(url, opts)) do
-        [{_key, expires_at, result}] ->
-          if expires_at > now do
-            {:ok, Map.put(result, :cached, true)}
-          else
-            :ets.delete(@cache_table, cache_key(url, opts))
-            :miss
-          end
-
-        [] ->
-          :miss
-      end
+      lookup_cached_result(cache_key(url, opts), System.system_time(:millisecond))
     else
       :miss
     end
   end
 
+  defp lookup_cached_result(key, now) do
+    case :ets.lookup(@cache_table, key) do
+      [{_key, expires_at, result}] -> handle_cached_result(key, expires_at, result, now)
+      [] -> :miss
+    end
+  end
+
+  defp handle_cached_result(_key, expires_at, result, now) when expires_at > now do
+    {:ok, Map.put(result, :cached, true)}
+  end
+
+  defp handle_cached_result(key, _expires_at, _result, _now) do
+    :ets.delete(@cache_table, key)
+    :miss
+  end
+
   defp maybe_store_cache(url, opts, result) do
     if opts[:cache] do
       ensure_cache_table!()
@@ -976,6 +893,52 @@ defmodule Jido.Browser.WebFetch do
      })}
   end
 
+  defp validate_url_length("", _max_url_length) do
+    {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})}
+  end
+
+  defp validate_url_length(normalized_url, max_url_length) do
+    if String.length(normalized_url) > max_url_length do
+      {:error,
+       Error.invalid_error("URL exceeds maximum length", %{
+         error_code: :url_too_long,
+         max_url_length: max_url_length
+       })}
+    else
+      :ok
+    end
+  end
+
+  defp parse_fetch_uri(normalized_url) do
+    uri = URI.parse(normalized_url)
+
+    if uri.scheme in ["http", "https"] do
+      {:ok, uri}
+    else
+      {:error,
+       Error.invalid_error("Web fetch only supports http and https URLs", %{
+         error_code: :invalid_input,
+         scheme: uri.scheme
+       })}
+    end
+  end
+
+  defp validate_uri_host(%URI{host: host}) when host in [nil, ""] do
+    {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})}
+  end
+
+  defp validate_uri_host(%URI{host: host}) do
+    if ascii_only?(host) do
+      :ok
+    else
+      {:error,
+       Error.invalid_error("Web fetch only accepts ASCII hostnames", %{
+         error_code: :url_not_allowed,
+         host: host
+       })}
+    end
+  end
+
   defp normalize_integer_opt(_name, value, min: min) when is_integer(value) and value >= min, do: {:ok, value}
 
   defp normalize_integer_opt(name, value, min: min) do
@@ -1086,8 +1049,6 @@ defmodule Jido.Browser.WebFetch do
     end)
   end
 
-  defp metadata_title(_metadata), do: nil
-
   defp metadata_value_to_string(nil), do: nil
   defp metadata_value_to_string(value) when is_binary(value), do: String.trim(value)
 
@@ -1099,13 +1060,43 @@ defmodule Jido.Browser.WebFetch do
   defp metadata_value_to_string(_value), do: nil
 
   defp normalize_metadata(metadata) when is_map(metadata), do: metadata
-  defp normalize_metadata(_metadata), do: %{}
 
   defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response
   defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata)
   defp maybe_put(opts, _key, nil), do: opts
   defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value)
 
+  defp matching_section_indexes(sections, terms) do
+    downcased_terms = Enum.map(terms, &String.downcase/1)
+
+    sections
+    |> Enum.with_index()
+    |> Enum.flat_map(fn {section, index} ->
+      if section_matches_term?(section, downcased_terms), do: [index], else: []
+    end)
+  end
+
+  defp section_matches_term?(section, downcased_terms) do
+    lowered = String.downcase(section)
+    Enum.any?(downcased_terms, &String.contains?(lowered, &1))
+  end
+
+  defp expand_focus_window(matching_indexes, window, section_count) do
+    matching_indexes
+    |> Enum.flat_map(fn index -> (index - window)..(index + window) end)
+    |> Enum.filter(&(&1 >= 0 and &1 < section_count))
+    |> Enum.uniq()
+    |> Enum.sort()
+  end
+
+  defp render_section_slice(sections, indexes) do
+    indexes
+    |> Enum.map(&Enum.at(sections, &1))
+    |> Enum.reject(&(&1 == ""))
+    |> Enum.join("\n\n")
+    |> String.trim()
+  end
+
   defp title_from_url(url) do
     path = URI.parse(url).path || ""