From 10427687dde34acb3ac855f8ea51e1782bc605d1 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 19:59:43 -0500
Subject: [PATCH 1/7] feat: add HTTP-first web fetch tool
---
CHANGELOG.md | 6 +-
README.md | 26 +
lib/jido_browser.ex | 24 +
lib/jido_browser/actions/web_fetch.ex | 92 ++
lib/jido_browser/plugin.ex | 107 +-
lib/jido_browser/web_fetch.ex | 915 ++++++++++++++++++
mix.exs | 7 +-
.../composite_actions_and_installer_test.exs | 59 ++
test/jido_browser/plugin_test.exs | 44 +-
test/jido_browser/web_fetch_test.exs | 166 ++++
10 files changed, 1418 insertions(+), 28 deletions(-)
create mode 100644 lib/jido_browser/actions/web_fetch.ex
create mode 100644 lib/jido_browser/web_fetch.ex
create mode 100644 test/jido_browser/web_fetch_test.exs
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fb19c91..99820c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Added
+
+- Add HTTP-first `Jido.Browser.web_fetch/2` and `Jido.Browser.Actions.WebFetch` for stateless page retrieval with domain policy, focused filtering, caching, and citation-ready passages
+
### Changed
- Rename the public Elixir namespace from `JidoBrowser.*` to `Jido.Browser.*`
@@ -110,4 +114,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Refactoring:
-* streamline agent-browser runtime defaults by mikehostetler
\ No newline at end of file
+* streamline agent-browser runtime defaults by mikehostetler
diff --git a/README.md b/README.md
index b7204cc..c1d1808 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,22 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow:
2. act on `@eN` refs
3. re-snapshot
+### Stateless Web Fetch
+
+```elixir
+{:ok, result} =
+ Jido.Browser.web_fetch(
+ "https://example.com/docs",
+ format: :markdown,
+ allowed_domains: ["example.com"],
+ focus_terms: ["API", "authentication"],
+ citations: true
+ )
+
+result.content
+result.passages
+```
+
### State Persistence
```elixir
@@ -143,6 +159,14 @@ config :jido_browser, :web,
profile: "default"
```
+Optional web fetch settings:
+
+```elixir
+config :jido_browser, :web_fetch,
+ cache_ttl_ms: 300_000,
+ pdftotext_path: "/usr/local/bin/pdftotext"
+```
+
## Backends
### AgentBrowser (Default)
@@ -173,6 +197,7 @@ Core operations:
- `type/4`
- `screenshot/2`
- `extract_content/2`
+- `web_fetch/2`
- `evaluate/3`
Agent-browser-native operations:
@@ -252,6 +277,7 @@ Agent-browser-native operations:
- `ReadPage`
- `SnapshotUrl`
- `SearchWeb`
+- `WebFetch`
## Using With Jido Agents
diff --git a/lib/jido_browser.ex b/lib/jido_browser.ex
index 58f3e3a..91139ac 100644
--- a/lib/jido_browser.ex
+++ b/lib/jido_browser.ex
@@ -8,11 +8,13 @@ defmodule Jido.Browser do
alias Jido.Browser.Error
alias Jido.Browser.Session
+ alias Jido.Browser.WebFetch
@default_adapter Jido.Browser.Adapters.AgentBrowser
@default_timeout 30_000
@supported_screenshot_formats [:png]
@supported_extract_formats [:markdown, :html, :text]
+ @supported_web_fetch_formats [:markdown, :html, :text]
@doc "Starts a browser session using the configured adapter or an explicit adapter override."
@spec start_session(keyword()) :: {:ok, Session.t()} | {:error, term()}
@@ -107,6 +109,28 @@ defmodule Jido.Browser do
end
end
+ @doc "Fetches a URL over HTTP(S) without starting a browser session."
+ @spec web_fetch(String.t(), keyword()) :: {:ok, map()} | {:error, term()}
+ def web_fetch(url, opts \\ [])
+
+ def web_fetch(url, _opts) when url in [nil, ""] do
+ {:error, Error.invalid_error("URL cannot be nil or empty", %{url: url})}
+ end
+
+ def web_fetch(url, opts) when is_binary(url) do
+ format = opts[:format] || :markdown
+
+ if format in @supported_web_fetch_formats do
+ WebFetch.fetch(url, normalize_timeout(opts))
+ else
+ {:error,
+ Error.invalid_error("Unsupported web fetch format: #{inspect(format)}", %{
+ format: format,
+ supported: @supported_web_fetch_formats
+ })}
+ end
+ end
+
@doc "Evaluates JavaScript in the browser when the adapter supports it."
@spec evaluate(Session.t(), String.t(), keyword()) ::
{:ok, Session.t(), map()} | {:error, term()}
diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex
new file mode 100644
index 0000000..417f07d
--- /dev/null
+++ b/lib/jido_browser/actions/web_fetch.ex
@@ -0,0 +1,92 @@
+defmodule Jido.Browser.Actions.WebFetch do
+ @moduledoc """
+ Stateless HTTP-first page retrieval for agent workflows.
+
+ `WebFetch` is a lighter-weight alternative to browser navigation when the
+ target content can be retrieved over plain HTTP(S) without JavaScript
+ execution.
+ """
+
+ use Jido.Action,
+ name: "web_fetch",
+ description:
+ "Fetch a URL over HTTP(S) with domain policy controls, optional focused filtering, " <>
+ "approximate token caps, and citation-ready passages.",
+ category: "Browser",
+ tags: ["browser", "web", "fetch", "http", "retrieval"],
+ vsn: "2.0.0",
+ schema: [
+ url: [type: :string, required: true, doc: "The URL to fetch"],
+ format: [type: {:in, [:markdown, :text, :html]}, default: :markdown, doc: "Output format"],
+ selector: [type: :string, doc: "Optional CSS selector for HTML pages"],
+ allowed_domains: [type: {:list, :string}, default: [], doc: "Allow-list of host or host/path rules"],
+ blocked_domains: [type: {:list, :string}, default: [], doc: "Block-list of host or host/path rules"],
+ focus_terms: [type: {:list, :string}, default: [], doc: "Terms used to filter the fetched document"],
+ focus_window: [type: :integer, default: 0, doc: "Paragraph window around each focus match"],
+ max_content_tokens: [type: :integer, doc: "Approximate token cap for returned content"],
+ citations: [type: :boolean, default: false, doc: "Include citation-ready passage offsets"],
+ cache: [type: :boolean, default: true, doc: "Reuse cached fetch results when available"],
+ timeout: [type: :integer, doc: "Receive timeout in milliseconds"],
+ require_known_url: [type: :boolean, default: false, doc: "Require the URL to already be present in tool context"],
+ known_urls: [type: {:list, :string}, default: [], doc: "Additional known URLs accepted for provenance checks"],
+ max_uses: [type: :integer, doc: "Maximum successful web fetch calls allowed in current skill state"]
+ ]
+
+ alias Jido.Browser.Error
+
+ @impl true
+ def run(params, context) do
+ with :ok <- validate_max_uses(params, context),
+ {:ok, result} <- Jido.Browser.web_fetch(params.url, build_opts(params, context)) do
+ {:ok, Map.put(result, :status, "success")}
+ else
+ {:error, %_{} = error} ->
+ {:error, error}
+
+ {:error, reason} ->
+ {:error, Error.adapter_error("Web fetch failed", %{reason: reason})}
+ end
+ end
+
+ defp build_opts(params, context) do
+ known_urls =
+ (Map.get(params, :known_urls, []) || [])
+ |> Kernel.++(get_in(context, [:skill_state, :seen_urls]) || [])
+ |> Enum.uniq()
+
+ []
+ |> maybe_put(:format, Map.get(params, :format, :markdown))
+ |> maybe_put(:selector, params[:selector])
+ |> maybe_put(:allowed_domains, Map.get(params, :allowed_domains, []))
+ |> maybe_put(:blocked_domains, Map.get(params, :blocked_domains, []))
+ |> maybe_put(:focus_terms, Map.get(params, :focus_terms, []))
+ |> maybe_put(:focus_window, Map.get(params, :focus_window, 0))
+ |> maybe_put(:max_content_tokens, params[:max_content_tokens])
+ |> maybe_put(:citations, Map.get(params, :citations, false))
+ |> maybe_put(:cache, Map.get(params, :cache, true))
+ |> maybe_put(:timeout, params[:timeout])
+ |> maybe_put(:require_known_url, Map.get(params, :require_known_url, false))
+ |> maybe_put(:known_urls, known_urls)
+ end
+
+ defp validate_max_uses(%{max_uses: max_uses}, context) when is_integer(max_uses) and max_uses >= 0 do
+ current_uses = get_in(context, [:skill_state, :web_fetch_uses]) || 0
+
+ if current_uses >= max_uses do
+ {:error,
+ Error.invalid_error("Web fetch max uses exceeded", %{
+ error_code: :max_uses_exceeded,
+ max_uses: max_uses,
+ current_uses: current_uses
+ })}
+ else
+ :ok
+ end
+ end
+
+ defp validate_max_uses(_params, _context), do: :ok
+
+ defp maybe_put(opts, _key, nil), do: opts
+ defp maybe_put(opts, _key, []), do: opts
+ defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value)
+end
diff --git a/lib/jido_browser/plugin.ex b/lib/jido_browser/plugin.ex
index 4e99408..2a58bba 100644
--- a/lib/jido_browser/plugin.ex
+++ b/lib/jido_browser/plugin.ex
@@ -36,6 +36,7 @@ require Jido.Browser.Actions.WaitForSelector
require Jido.Browser.Actions.ReadPage
require Jido.Browser.Actions.SnapshotUrl
require Jido.Browser.Actions.SearchWeb
+require Jido.Browser.Actions.WebFetch
defmodule Jido.Browser.Plugin do
@moduledoc """
@@ -119,7 +120,8 @@ defmodule Jido.Browser.Plugin do
# Self-contained composite actions (manage own session)
Jido.Browser.Actions.ReadPage,
Jido.Browser.Actions.SnapshotUrl,
- Jido.Browser.Actions.SearchWeb
+ Jido.Browser.Actions.SearchWeb,
+ Jido.Browser.Actions.WebFetch
],
description: "Browser automation for web navigation, interaction, and content extraction",
category: "browser",
@@ -136,7 +138,9 @@ defmodule Jido.Browser.Plugin do
viewport: Map.get(config, :viewport, %{width: 1280, height: 720}),
base_url: Map.get(config, :base_url),
last_url: nil,
- last_title: nil
+ last_title: nil,
+ seen_urls: [],
+ web_fetch_uses: 0
}
{:ok, initial_state}
@@ -151,7 +155,9 @@ defmodule Jido.Browser.Plugin do
viewport: Zoi.any(description: "Browser viewport dimensions") |> Zoi.optional(),
base_url: Zoi.string(description: "Base URL for relative navigation") |> Zoi.optional(),
last_url: Zoi.string(description: "Last navigated URL") |> Zoi.optional(),
- last_title: Zoi.string(description: "Last page title") |> Zoi.optional()
+ last_title: Zoi.string(description: "Last page title") |> Zoi.optional(),
+ seen_urls: Zoi.array(Zoi.string(description: "Known URLs discovered during tool use")) |> Zoi.default([]),
+ web_fetch_uses: Zoi.integer(description: "Successful web fetch calls in current skill state") |> Zoi.default(0)
})
end
@@ -204,7 +210,8 @@ defmodule Jido.Browser.Plugin do
# Self-contained composite actions
{"browser.read_page", Jido.Browser.Actions.ReadPage},
{"browser.snapshot_url", Jido.Browser.Actions.SnapshotUrl},
- {"browser.search_web", Jido.Browser.Actions.SearchWeb}
+ {"browser.search_web", Jido.Browser.Actions.SearchWeb},
+ {"browser.web_fetch", Jido.Browser.Actions.WebFetch}
]
end
@@ -214,22 +221,17 @@ defmodule Jido.Browser.Plugin do
end
@impl Jido.Plugin
- def transform_result(_action, {:ok, result}, _context) when is_map(result) do
- case Map.get(result, :session) do
- %Jido.Browser.Session{} = session ->
- current_url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url])
- current_title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title])
-
- state_updates = %{
- session: session,
- last_url: current_url,
- last_title: current_title
- }
-
- {:ok, result, state_updates}
+ def transform_result(action, {:ok, result}, context) when is_map(result) do
+ state_updates =
+ %{}
+ |> maybe_put_session_state(result)
+ |> maybe_put_seen_urls(result, context)
+ |> maybe_increment_web_fetch_uses(action, context)
- _ ->
- {:ok, result}
+ if map_size(state_updates) == 0 do
+ {:ok, result}
+ else
+ {:ok, result, state_updates}
end
end
@@ -260,6 +262,70 @@ defmodule Jido.Browser.Plugin do
end
end
+ defp maybe_put_session_state(acc, result) do
+ case Map.get(result, :session) do
+ %Jido.Browser.Session{} = session ->
+ current_url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url])
+ current_title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title])
+
+ Map.merge(acc, %{
+ session: session,
+ last_url: current_url,
+ last_title: current_title
+ })
+
+ _ ->
+ acc
+ end
+ end
+
+ defp maybe_put_seen_urls(acc, result, context) do
+ current_seen_urls = get_in(context, [:skill_state, :seen_urls]) || []
+
+ seen_urls =
+ current_seen_urls
+ |> Kernel.++(extract_urls(result))
+ |> Enum.reject(&is_nil_or_empty/1)
+ |> Enum.uniq()
+
+ if seen_urls == [] or seen_urls == current_seen_urls do
+ acc
+ else
+ Map.put(acc, :seen_urls, seen_urls)
+ end
+ end
+
+ defp maybe_increment_web_fetch_uses(acc, Jido.Browser.Actions.WebFetch, context) do
+ current_uses = get_in(context, [:skill_state, :web_fetch_uses]) || 0
+ Map.put(acc, :web_fetch_uses, current_uses + 1)
+ end
+
+ defp maybe_increment_web_fetch_uses(acc, _action, _context), do: acc
+
+ defp extract_urls(result) do
+ direct_urls =
+ [Map.get(result, :url), Map.get(result, "url"), Map.get(result, :final_url), Map.get(result, "final_url")]
+ |> Enum.reject(&is_nil_or_empty/1)
+
+ search_urls =
+ result
+ |> Map.get(:results, Map.get(result, "results", []))
+ |> List.wrap()
+ |> Enum.map(fn item ->
+ cond do
+ is_map(item) -> Map.get(item, :url) || Map.get(item, "url")
+ true -> nil
+ end
+ end)
+ |> Enum.reject(&is_nil_or_empty/1)
+
+ direct_urls ++ search_urls
+ end
+
+ defp is_nil_or_empty(nil), do: true
+ defp is_nil_or_empty(""), do: true
+ defp is_nil_or_empty(_value), do: false
+
def signal_patterns do
[
# Session lifecycle
@@ -308,7 +374,8 @@ defmodule Jido.Browser.Plugin do
# Self-contained composite actions
"browser.read_page",
"browser.snapshot_url",
- "browser.search_web"
+ "browser.search_web",
+ "browser.web_fetch"
]
end
end
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
new file mode 100644
index 0000000..8599598
--- /dev/null
+++ b/lib/jido_browser/web_fetch.ex
@@ -0,0 +1,915 @@
+defmodule Jido.Browser.WebFetch do
+ @moduledoc """
+ Stateless HTTP-first web retrieval with optional domain policy, caching,
+ focused filtering, and citation-ready passage metadata.
+
+ This module is intended for document retrieval workloads where starting a full
+ browser session would be unnecessary or too expensive.
+ """
+
+ alias Jido.Browser.Error
+
+ @cache_table :jido_browser_web_fetch_cache
+ @default_timeout 15_000
+ @default_max_redirects 5
+ @default_cache_ttl_ms 300_000
+ @default_max_url_length 2_048
+ @supported_formats [:markdown, :text, :html]
+ @html_content_types ["text/html", "application/xhtml+xml"]
+ @text_content_types ["text/plain", "text/markdown", "text/csv", "text/xml", "application/xml"]
+ @pdf_content_types ["application/pdf"]
+
+ @type result :: %{
+ required(:url) => String.t(),
+ required(:final_url) => String.t(),
+ required(:content) => String.t(),
+ required(:format) => atom(),
+ required(:content_type) => String.t(),
+ required(:document_type) => atom(),
+ required(:retrieved_at) => String.t(),
+ required(:estimated_tokens) => non_neg_integer(),
+ required(:original_estimated_tokens) => non_neg_integer(),
+ required(:truncated) => boolean(),
+ required(:filtered) => boolean(),
+ required(:focus_matches) => non_neg_integer(),
+ required(:cached) => boolean(),
+ required(:citations) => %{enabled: boolean()},
+ required(:passages) => list(map()),
+ optional(:title) => String.t() | nil
+ }
+
+ @doc """
+ Fetches a URL over HTTP(S) and returns normalized document content.
+
+ Supported options:
+ - `:format` - `:markdown`, `:text`, or `:html`
+ - `:selector` - CSS selector for HTML pages
+ - `:allowed_domains` / `:blocked_domains` - mutually exclusive host/path rules
+ - `:max_content_tokens` - approximate token cap
+ - `:citations` - boolean, when true include passage spans
+ - `:focus_terms` - list of terms used for focused filtering
+ - `:focus_window` - paragraph window around focus matches
+ - `:timeout` - receive timeout in milliseconds
+ - `:cache` - enable ETS cache, defaults to `true`
+ - `:cache_ttl_ms` - cache TTL in milliseconds
+ - `:require_known_url` / `:known_urls` - optional URL provenance guard
+ """
+ @spec fetch(String.t(), keyword()) :: {:ok, result()} | {:error, Exception.t()}
+ def fetch(url, opts \\ [])
+
+ def fetch(url, opts) when is_binary(url) and is_list(opts) do
+ with {:ok, opts} <- normalize_opts(opts),
+ {:ok, normalized_url, uri} <- validate_url(url, opts),
+ :ok <- validate_known_url(normalized_url, opts),
+ :ok <- validate_domain_filters(uri, opts) do
+ case fetch_cached(normalized_url, opts) do
+ {:ok, result} ->
+ {:ok, result}
+
+ :miss ->
+ do_fetch(normalized_url, opts)
+ end
+ end
+ end
+
+ def fetch(url, _opts) do
+ {:error, Error.invalid_error("URL must be a non-empty string", %{error_code: :invalid_input, url: url})}
+ end
+
+ @doc false
+ @spec clear_cache() :: :ok
+ def clear_cache do
+ case :ets.whereis(@cache_table) do
+ :undefined ->
+ :ok
+
+ table ->
+ :ets.delete_all_objects(table)
+ :ok
+ end
+ end
+
+ defp do_fetch(url, opts) do
+ request_opts = [
+ url: url,
+ headers: request_headers(),
+ receive_timeout: opts[:timeout],
+ redirect: true,
+ max_redirects: opts[:max_redirects]
+ ]
+
+ case Req.run(request_opts) do
+ {%Req.Request{} = request, %Req.Response{} = response} ->
+ with :ok <- validate_http_status(response, url),
+ {:ok, final_url, final_uri} <- normalize_final_url(request),
+ :ok <- validate_domain_filters(final_uri, opts),
+ {:ok, result} <- build_result(url, final_url, response, opts) do
+ maybe_store_cache(url, opts, result)
+ {:ok, result}
+ end
+
+ {_request, %Req.TransportError{} = exception} ->
+ {:error, Error.adapter_error("Web fetch request failed", %{error_code: :url_not_accessible, reason: exception})}
+
+ {_request, %Req.TooManyRedirectsError{} = exception} ->
+ {:error,
+ Error.adapter_error("Web fetch exceeded redirect limit", %{error_code: :url_not_accessible, reason: exception})}
+
+ {_request, %_{} = exception} ->
+ {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: exception})}
+
+ {_request, reason} ->
+ {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: reason})}
+ end
+ end
+
+ defp build_result(url, final_url, response, opts) do
+ content_type = response_content_type(response)
+
+ cond do
+ content_type in @html_content_types ->
+ build_html_result(url, final_url, response.body, content_type, opts)
+
+ content_type in @pdf_content_types ->
+ build_pdf_result(url, final_url, response.body, content_type, opts)
+
+ text_content_type?(content_type) ->
+ build_text_result(url, final_url, response.body, content_type, opts)
+
+ true ->
+ {:error,
+ Error.adapter_error("Unsupported content type for web fetch", %{
+ error_code: :unsupported_content_type,
+ content_type: content_type
+ })}
+ end
+ end
+
+ defp build_html_result(url, final_url, body, content_type, opts) when is_binary(body) do
+ selector = opts[:selector]
+
+ with {:ok, document} <- parse_document(body),
+ {:ok, html} <- select_html(document, body, selector),
+ {:ok, title} <- extract_title(document),
+ {:ok, content} <- format_html(html, opts[:format], opts),
+ {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
+ {final_content, truncated, original_estimated_tokens} <-
+ maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+ {:ok,
+ build_response(
+ url,
+ final_url,
+ final_content,
+ title,
+ content_type,
+ :html,
+ opts,
+ truncated,
+ filtered,
+ focus_matches,
+ original_estimated_tokens
+ )}
+ end
+ end
+
+ defp build_html_result(_url, _final_url, body, content_type, _opts) do
+ {:error,
+ Error.adapter_error("Unexpected response body for HTML fetch", %{
+ error_code: :unavailable,
+ content_type: content_type,
+ body: body
+ })}
+ end
+
+ defp build_text_result(url, final_url, body, content_type, opts) when is_binary(body) do
+ if opts[:selector] do
+ {:error,
+ Error.invalid_error("Selector filtering is only supported for HTML content", %{
+ error_code: :invalid_input,
+ selector: opts[:selector],
+ content_type: content_type
+ })}
+ else
+ with {:ok, content} <- format_text(body, opts[:format]),
+ {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
+ {final_content, truncated, original_estimated_tokens} <-
+ maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+ {:ok,
+ build_response(
+ url,
+ final_url,
+ final_content,
+ nil,
+ content_type,
+ :text,
+ opts,
+ truncated,
+ filtered,
+ focus_matches,
+ original_estimated_tokens
+ )}
+ end
+ end
+ end
+
+ defp build_text_result(_url, _final_url, body, content_type, _opts) do
+ {:error,
+ Error.adapter_error("Unexpected response body for text fetch", %{
+ error_code: :unavailable,
+ content_type: content_type,
+ body: body
+ })}
+ end
+
+ defp build_pdf_result(url, final_url, body, content_type, opts) when is_binary(body) do
+ cond do
+ opts[:selector] ->
+ {:error,
+ Error.invalid_error("Selector filtering is not supported for PDF content", %{
+ error_code: :invalid_input,
+ selector: opts[:selector],
+ content_type: content_type
+ })}
+
+ opts[:format] == :html ->
+ {:error,
+ Error.invalid_error("HTML output is not supported for PDF content", %{
+ error_code: :invalid_input,
+ format: :html,
+ content_type: content_type
+ })}
+
+ true ->
+ with {:ok, text} <- extract_pdf_text(body),
+ {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts),
+ {final_content, truncated, original_estimated_tokens} <-
+ maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+ {:ok,
+ build_response(
+ url,
+ final_url,
+ final_content,
+ title_from_url(final_url),
+ content_type,
+ :pdf,
+ opts,
+ truncated,
+ filtered,
+ focus_matches,
+ original_estimated_tokens
+ )}
+ end
+ end
+ end
+
+ defp build_pdf_result(_url, _final_url, body, content_type, _opts) do
+ {:error,
+ Error.adapter_error("Unexpected response body for PDF fetch", %{
+ error_code: :unavailable,
+ content_type: content_type,
+ body: body
+ })}
+ end
+
+ defp build_response(
+ url,
+ final_url,
+ content,
+ title,
+ content_type,
+ document_type,
+ opts,
+ truncated,
+ filtered,
+ focus_matches,
+ original_estimated_tokens
+ ) do
+ passages = maybe_build_passages(content, title, final_url, opts[:citations])
+
+ %{
+ url: url,
+ final_url: final_url,
+ title: title,
+ content: content,
+ format: opts[:format],
+ content_type: content_type,
+ document_type: document_type,
+ retrieved_at: retrieved_at(),
+ estimated_tokens: estimate_tokens(content),
+ original_estimated_tokens: original_estimated_tokens,
+ truncated: truncated,
+ filtered: filtered,
+ focus_matches: focus_matches,
+ cached: false,
+ citations: %{enabled: opts[:citations]},
+ passages: passages
+ }
+ end
+
+ defp normalize_opts(opts) do
+ format = opts[:format] || :markdown
+ citations = normalize_citations(opts[:citations])
+ focus_terms = normalize_focus_terms(opts[:focus_terms])
+
+ cond do
+ format not in @supported_formats ->
+ {:error,
+ Error.invalid_error("Unsupported web fetch format", %{
+ error_code: :invalid_input,
+ format: format,
+ supported_formats: @supported_formats
+ })}
+
+ present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) ->
+ {:error,
+ Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{
+ error_code: :invalid_input
+ })}
+
+ format == :html and focus_terms != [] ->
+ {:error,
+ Error.invalid_error("Focused filtering is only supported for markdown and text output", %{
+ error_code: :invalid_input,
+ format: format
+ })}
+
+ true ->
+ normalized =
+ opts
+ |> Keyword.put(:format, format)
+ |> Keyword.put(:citations, citations)
+ |> Keyword.put(:focus_terms, focus_terms)
+ |> Keyword.put_new(:focus_window, 0)
+ |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
+ |> Keyword.put_new(:max_redirects, @default_max_redirects)
+ |> Keyword.put_new(:cache, true)
+ |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
+ |> Keyword.put_new(:known_urls, [])
+
+ {:ok, normalized}
+ end
+ end
+
+ defp validate_url(url, opts) do
+ normalized_url = String.trim(url)
+ max_url_length = opts[:max_url_length] || @default_max_url_length
+
+ cond do
+ normalized_url == "" ->
+ {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})}
+
+ String.length(normalized_url) > max_url_length ->
+ {:error,
+ Error.invalid_error("URL exceeds maximum length", %{
+ error_code: :url_too_long,
+ max_url_length: max_url_length
+ })}
+
+ true ->
+ uri = URI.parse(normalized_url)
+
+ cond do
+ uri.scheme not in ["http", "https"] ->
+ {:error,
+ Error.invalid_error("Web fetch only supports http and https URLs", %{
+ error_code: :invalid_input,
+ scheme: uri.scheme
+ })}
+
+ is_nil(uri.host) or uri.host == "" ->
+ {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})}
+
+ not ascii_only?(uri.host) ->
+ {:error,
+ Error.invalid_error("Web fetch only accepts ASCII hostnames", %{
+ error_code: :url_not_allowed,
+ host: uri.host
+ })}
+
+ true ->
+ {:ok, URI.to_string(uri), normalize_uri(uri)}
+ end
+ end
+ end
+
+ defp validate_known_url(url, opts) do
+ known_urls =
+ opts[:known_urls]
+ |> List.wrap()
+ |> Enum.map(&normalize_known_url/1)
+ |> Enum.reject(&is_nil/1)
+
+ if not Keyword.get(opts, :require_known_url, false) do
+ :ok
+ else
+ if url in known_urls do
+ :ok
+ else
+ {:error,
+ Error.invalid_error("Web fetch URL must already be present in tool context", %{
+ error_code: :url_not_allowed,
+ url: url
+ })}
+ end
+ end
+ end
+
+ defp validate_domain_filters(%URI{} = uri, opts) do
+ with {:ok, allowed_rules} <- normalize_domain_rules(opts[:allowed_domains]),
+ {:ok, blocked_rules} <- normalize_domain_rules(opts[:blocked_domains]) do
+ cond do
+ allowed_rules != [] and not Enum.any?(allowed_rules, &rule_matches?(&1, uri)) ->
+ {:error,
+ Error.invalid_error("URL is not permitted by allowed_domains", %{
+ error_code: :url_not_allowed,
+ url: URI.to_string(uri)
+ })}
+
+ blocked_rules != [] and Enum.any?(blocked_rules, &rule_matches?(&1, uri)) ->
+ {:error,
+ Error.invalid_error("URL is blocked by blocked_domains", %{
+ error_code: :url_not_allowed,
+ url: URI.to_string(uri)
+ })}
+
+ true ->
+ :ok
+ end
+ end
+ end
+
+ defp normalize_domain_rules(nil), do: {:ok, []}
+
+ defp normalize_domain_rules(rules) do
+ rules
+ |> List.wrap()
+ |> Enum.reduce_while({:ok, []}, fn rule, {:ok, acc} ->
+ case normalize_domain_rule(rule) do
+ {:ok, normalized} -> {:cont, {:ok, [normalized | acc]}}
+ {:error, reason} -> {:halt, {:error, reason}}
+ end
+ end)
+ |> case do
+ {:ok, normalized} -> {:ok, Enum.reverse(normalized)}
+ error -> error
+ end
+ end
+
+ defp normalize_domain_rule(rule) when is_binary(rule) do
+ normalized = String.trim(rule)
+
+ cond do
+ normalized == "" ->
+ {:error, Error.invalid_error("Domain rules cannot be empty", %{error_code: :invalid_input})}
+
+ String.contains?(normalized, "://") ->
+ {:error,
+ Error.invalid_error("Domain rules must not include URL schemes", %{
+ error_code: :invalid_input,
+ rule: normalized
+ })}
+
+ true ->
+ uri = URI.parse("https://" <> normalized)
+ host = String.downcase(uri.host || "")
+ path = uri.path || "/"
+
+ cond do
+ host == "" ->
+ {:error,
+ Error.invalid_error("Domain rule must include a host", %{error_code: :invalid_input, rule: normalized})}
+
+ not ascii_only?(host) ->
+ {:error,
+ Error.invalid_error("Domain rules must use ASCII hosts", %{
+ error_code: :invalid_input,
+ rule: normalized
+ })}
+
+ true ->
+ {:ok, %{host: host, path: normalize_rule_path(path)}}
+ end
+ end
+ end
+
+ defp normalize_domain_rule(rule) do
+ {:error, Error.invalid_error("Domain rule must be a string", %{error_code: :invalid_input, rule: rule})}
+ end
+
+ defp rule_matches?(%{host: host, path: path}, %URI{host: uri_host} = uri) do
+ uri_host = String.downcase(uri_host || "")
+ request_path = normalize_rule_path(uri.path || "/")
+
+ host_matches? = uri_host == host or String.ends_with?(uri_host, "." <> host)
+ path_matches? = path == "/" or String.starts_with?(request_path, path)
+
+ host_matches? and path_matches?
+ end
+
+ defp normalize_final_url(%Req.Request{url: %URI{} = uri}) do
+ normalized = normalize_uri(uri)
+ {:ok, URI.to_string(normalized), normalized}
+ end
+
+ defp validate_http_status(%Req.Response{status: status}, _url) when status in 200..299, do: :ok
+
+ defp validate_http_status(%Req.Response{status: 429}, _url) do
+ {:error, Error.adapter_error("Web fetch rate limited", %{error_code: :too_many_requests, status: 429})}
+ end
+
+ defp validate_http_status(%Req.Response{status: status}, url) do
+ {:error,
+ Error.adapter_error("Web fetch returned an HTTP error", %{
+ error_code: :url_not_accessible,
+ status: status,
+ url: url
+ })}
+ end
+
+ defp parse_document(body) do
+ case Floki.parse_document(body) do
+ {:ok, document} ->
+ {:ok, document}
+
+ {:error, reason} ->
+ {:error, Error.adapter_error("Failed to parse fetched HTML", %{error_code: :unavailable, reason: reason})}
+ end
+ end
+
+ defp select_html(_document, body, nil), do: {:ok, body}
+ defp select_html(document, body, ""), do: select_html(document, body, nil)
+
+ defp select_html(document, _body, selector) do
+ nodes = Floki.find(document, selector)
+
+ if nodes == [] do
+ {:error,
+ Error.invalid_error("Selector did not match any elements in fetched HTML", %{
+ error_code: :invalid_input,
+ selector: selector
+ })}
+ else
+ {:ok, Floki.raw_html(nodes)}
+ end
+ end
+
+ defp extract_title(document) do
+ title =
+ document
+ |> Floki.find("title")
+ |> Floki.text(sep: " ")
+ |> String.trim()
+ |> blank_to_nil()
+
+ {:ok, title}
+ end
+
+ defp format_html(html, :html, _opts), do: {:ok, html}
+
+ defp format_html(html, :text, _opts) do
+ with {:ok, fragment} <- parse_fragment(html) do
+ {:ok, fragment |> Floki.text(sep: "\n") |> String.trim()}
+ end
+ end
+
+ defp format_html(html, :markdown, _opts) do
+ {:ok, Html2Markdown.convert(html) |> String.trim()}
+ rescue
+ error ->
+ {:error,
+ Error.adapter_error("Failed to convert fetched HTML to markdown", %{error_code: :unavailable, reason: error})}
+ end
+
+ defp format_text(text, :text), do: {:ok, String.trim(text)}
+ defp format_text(text, :markdown), do: {:ok, String.trim(text)}
+
+ defp format_text(_text, :html) do
+ {:error,
+ Error.invalid_error("HTML output is only supported for HTML content", %{
+ error_code: :invalid_input
+ })}
+ end
+
+ defp parse_fragment(html) do
+ case Floki.parse_fragment(html) do
+ {:ok, fragment} ->
+ {:ok, fragment}
+
+ {:error, reason} ->
+ {:error,
+ Error.adapter_error("Failed to parse fetched HTML fragment", %{error_code: :unavailable, reason: reason})}
+ end
+ end
+
+ defp maybe_filter_content(content, opts) do
+ case opts[:focus_terms] do
+ [] ->
+ {:ok, content, false, 0}
+
+ terms ->
+ sections = split_sections(content)
+ downcased_terms = Enum.map(terms, &String.downcase/1)
+
+ matching_indexes =
+ sections
+ |> Enum.with_index()
+ |> Enum.flat_map(fn {section, index} ->
+ lowered = String.downcase(section)
+
+ if Enum.any?(downcased_terms, &String.contains?(lowered, &1)) do
+ [index]
+ else
+ []
+ end
+ end)
+
+ window = max(opts[:focus_window] || 0, 0)
+
+ kept_indexes =
+ matching_indexes
+ |> Enum.flat_map(fn index -> (index - window)..(index + window) end)
+ |> Enum.filter(&(&1 >= 0 and &1 < length(sections)))
+ |> Enum.uniq()
+ |> Enum.sort()
+
+ filtered_content =
+ kept_indexes
+ |> Enum.map(&Enum.at(sections, &1))
+ |> Enum.reject(&(&1 == ""))
+ |> Enum.join("\n\n")
+ |> String.trim()
+
+ {:ok, filtered_content, true, length(matching_indexes)}
+ end
+ end
+
+ defp maybe_truncate(content, nil), do: {content, false, estimate_tokens(content)}
+
+ defp maybe_truncate(content, max_content_tokens) when is_integer(max_content_tokens) and max_content_tokens > 0 do
+ original_estimated_tokens = estimate_tokens(content)
+
+ if original_estimated_tokens <= max_content_tokens do
+ {content, false, original_estimated_tokens}
+ else
+ char_limit = max_content_tokens * 4
+ truncated = String.slice(content, 0, char_limit) |> String.trim()
+ {truncated, true, original_estimated_tokens}
+ end
+ end
+
+ defp maybe_truncate(content, _other), do: {content, false, estimate_tokens(content)}
+
  # Citation passages are only built when citations were requested.
  defp maybe_build_passages(_content, _title, _url, false), do: []

  # Builds citation-ready passages from the final (possibly filtered and
  # truncated) content: one passage per non-empty section, each carrying
  # character offsets into the joined content, capped at 50 passages.
  #
  # NOTE(review): `end_char + 2` assumes sections are separated by exactly
  # "\n\n"; offsets are approximate whenever the original separators differed
  # (split_sections/1 splits on any blank-line run).
  defp maybe_build_passages(content, title, url, true) do
    content
    |> split_sections()
    |> Enum.reject(&(&1 == ""))
    |> Enum.reduce({[], 0, 0}, fn section, {passages, cursor, index} ->
      # cursor tracks the running character offset of this section within
      # the re-joined content string.
      start_char = cursor
      end_char = start_char + String.length(section)

      passage = %{
        index: index,
        start_char: start_char,
        end_char: end_char,
        text: section,
        title: title,
        url: url
      }

      # Prepend for O(1) accumulation; reversed once below.
      {[passage | passages], end_char + 2, index + 1}
    end)
    |> elem(0)
    |> Enum.reverse()
    |> Enum.take(50)
  end
+
+ defp split_sections(content) do
+ content
+ |> String.split(~r/\n\s*\n+/, trim: true)
+ |> case do
+ [] -> [String.trim(content)]
+ sections -> Enum.map(sections, &String.trim/1)
+ end
+ end
+
  # Extracts plain text from PDF bytes by shelling out to `pdftotext`.
  #
  # Returns `{:ok, trimmed_text}` or an `{:error, adapter_error}` when the
  # binary is missing, the command exits non-zero, or the output file cannot
  # be read.
  defp extract_pdf_text(bytes) do
    case pdftotext_path() do
      nil ->
        {:error,
         Error.adapter_error("PDF extraction requires pdftotext to be installed", %{
           error_code: :unsupported_content_type,
           content_type: "application/pdf"
         })}

      binary ->
        # pdftotext operates on file paths, so the bytes are round-tripped
        # through temp files; with_tmp_files/4 guarantees cleanup.
        with_tmp_files("jido_browser_web_fetch", ".pdf", ".txt", fn pdf_path, txt_path ->
          File.write!(pdf_path, bytes)

          # -layout preserves column layout; -nopgbrk suppresses page-break
          # characters in the extracted text.
          case System.cmd(binary, ["-layout", "-nopgbrk", pdf_path, txt_path], stderr_to_stdout: true) do
            {_output, 0} ->
              case File.read(txt_path) do
                {:ok, text} ->
                  {:ok, String.trim(text)}

                {:error, reason} ->
                  {:error,
                   Error.adapter_error("Failed to read extracted PDF text", %{error_code: :unavailable, reason: reason})}
              end

            {output, status} ->
              # Non-zero exit: surface both the exit status and the combined
              # stdout/stderr output for diagnosis.
              {:error,
               Error.adapter_error("pdftotext failed while extracting PDF", %{
                 error_code: :unavailable,
                 status: status,
                 output: output
               })}
          end
        end)
    end
  end
+
+ defp pdftotext_path do
+ config(:pdftotext_path) || System.find_executable("pdftotext")
+ end
+
+ defp fetch_cached(url, opts) do
+ if opts[:cache] do
+ ensure_cache_table!()
+ now = System.system_time(:millisecond)
+
+ case :ets.lookup(@cache_table, cache_key(url, opts)) do
+ [{_key, expires_at, result}] ->
+ if expires_at > now do
+ {:ok, Map.put(result, :cached, true)}
+ else
+ :ets.delete(@cache_table, cache_key(url, opts))
+ :miss
+ end
+
+ [] ->
+ :miss
+ end
+ else
+ :miss
+ end
+ end
+
+ defp maybe_store_cache(url, opts, result) do
+ if opts[:cache] do
+ ensure_cache_table!()
+
+ expires_at = System.system_time(:millisecond) + max(opts[:cache_ttl_ms], 0)
+ :ets.insert(@cache_table, {cache_key(url, opts), expires_at, result})
+ end
+
+ :ok
+ end
+
  # Lazily creates the named ETS cache table.
  #
  # Creation can race between concurrent fetches; the rescue absorbs the
  # ArgumentError raised when another process wins the race. The return value
  # is not used by callers (it is either an :ets table identifier or, on the
  # rescue branch, the table-name atom).
  defp ensure_cache_table! do
    case :ets.whereis(@cache_table) do
      :undefined ->
        try do
          :ets.new(@cache_table, [:named_table, :set, :public, read_concurrency: true, write_concurrency: true])
        rescue
          # Another process created the table first; treat as success.
          ArgumentError -> @cache_table
        end

      table ->
        table
    end
  end
+
+ defp cache_key(url, opts) do
+ {:jido_browser_web_fetch, url, opts[:format], opts[:selector], opts[:allowed_domains], opts[:blocked_domains],
+ opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations]}
+ end
+
+ defp request_headers do
+ [
+ {"accept", "text/html,application/xhtml+xml,text/plain,application/pdf;q=0.9,*/*;q=0.1"},
+ {"user-agent", user_agent()}
+ ]
+ end
+
+ defp user_agent do
+ vsn =
+ case Application.spec(:jido_browser, :vsn) do
+ nil -> "dev"
+ value -> List.to_string(value)
+ end
+
+ "jido_browser/#{vsn}"
+ end
+
+ defp response_content_type(response) do
+ response
+ |> Req.Response.get_header("content-type")
+ |> List.first()
+ |> case do
+ nil -> infer_content_type(response.body)
+ content_type -> content_type |> String.split(";") |> hd() |> String.trim() |> String.downcase()
+ end
+ end
+
+ defp infer_content_type(body) when is_binary(body) do
+ if String.starts_with?(body, "%PDF-") do
+ "application/pdf"
+ else
+ "text/plain"
+ end
+ end
+
+ defp infer_content_type(_body), do: "application/octet-stream"
+
+ defp text_content_type?(content_type) do
+ content_type in @text_content_types or String.starts_with?(content_type, "text/")
+ end
+
+ defp retrieved_at do
+ DateTime.utc_now()
+ |> DateTime.truncate(:second)
+ |> DateTime.to_iso8601()
+ end
+
+ defp estimate_tokens(content) when is_binary(content) do
+ div(String.length(content) + 3, 4)
+ end
+
+ defp estimate_tokens(_content), do: 0
+
+ defp normalize_citations(%{enabled: enabled}), do: enabled == true
+ defp normalize_citations(enabled), do: enabled == true
+
+ defp present_domain_rules?(rules), do: rules not in [nil, []]
+
+ defp normalize_focus_terms(nil), do: []
+
+ defp normalize_focus_terms(terms) do
+ terms
+ |> List.wrap()
+ |> Enum.map(fn
+ term when is_binary(term) -> String.trim(term)
+ term -> to_string(term)
+ end)
+ |> Enum.reject(&(&1 == ""))
+ |> Enum.uniq()
+ end
+
+ defp normalize_known_url(url) when is_binary(url) do
+ url
+ |> String.trim()
+ |> case do
+ "" -> nil
+ value -> value
+ end
+ end
+
+ defp normalize_known_url(_), do: nil
+
+ defp normalize_uri(%URI{} = uri) do
+ %{uri | host: String.downcase(uri.host || ""), fragment: nil}
+ end
+
+ defp normalize_rule_path(nil), do: "/"
+ defp normalize_rule_path(""), do: "/"
+ defp normalize_rule_path(path), do: if(String.starts_with?(path, "/"), do: path, else: "/" <> path)
+
+ defp title_from_url(url) do
+ path = URI.parse(url).path || ""
+
+ case path do
+ "" -> nil
+ "/" -> nil
+ value -> value |> Path.basename() |> String.trim("/") |> blank_to_nil()
+ end
+ end
+
+ defp blank_to_nil(nil), do: nil
+ defp blank_to_nil(""), do: nil
+ defp blank_to_nil(value), do: value
+
+ defp ascii_only?(value) when is_binary(value) do
+ String.printable?(value) and String.match?(value, ~r/^[\x00-\x7F]+$/)
+ end
+
+ defp config(key, default \\ nil) do
+ :jido_browser
+ |> Application.get_env(:web_fetch, [])
+ |> Keyword.get(key, default)
+ end
+
+ defp with_tmp_files(prefix, first_suffix, second_suffix, fun) do
+ base = Path.join(System.tmp_dir!(), "#{prefix}_#{System.unique_integer([:positive])}")
+ first = base <> first_suffix
+ second = base <> second_suffix
+
+ try do
+ fun.(first, second)
+ after
+ File.rm(first)
+ File.rm(second)
+ end
+ end
+end
diff --git a/mix.exs b/mix.exs
index b7ce8ca..091d21a 100644
--- a/mix.exs
+++ b/mix.exs
@@ -69,6 +69,7 @@ defmodule Jido.Browser.MixProject do
{:req, "~> 0.5"},
{:jason, "~> 1.4"},
{:uniq, "~> 0.6"},
+ {:floki, "~> 0.38"},
{:html2markdown, "~> 0.3"},
# Dev/Test
@@ -111,7 +112,8 @@ defmodule Jido.Browser.MixProject do
Core: [
Jido.Browser,
Jido.Browser.Session,
- Jido.Browser.Plugin
+ Jido.Browser.Plugin,
+ Jido.Browser.WebFetch
],
Adapters: [
Jido.Browser.Adapter,
@@ -154,7 +156,8 @@ defmodule Jido.Browser.MixProject do
"Content Extraction": [
Jido.Browser.Actions.Snapshot,
Jido.Browser.Actions.Screenshot,
- Jido.Browser.Actions.ExtractContent
+ Jido.Browser.Actions.ExtractContent,
+ Jido.Browser.Actions.WebFetch
],
Advanced: [
Jido.Browser.Actions.Evaluate
diff --git a/test/jido_browser/composite_actions_and_installer_test.exs b/test/jido_browser/composite_actions_and_installer_test.exs
index dddf9d5..52b4633 100644
--- a/test/jido_browser/composite_actions_and_installer_test.exs
+++ b/test/jido_browser/composite_actions_and_installer_test.exs
@@ -5,6 +5,7 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do
alias Jido.Browser.Actions.ReadPage
alias Jido.Browser.Actions.SearchWeb
alias Jido.Browser.Actions.SnapshotUrl
+ alias Jido.Browser.Actions.WebFetch
alias Jido.Browser.Installer
alias Jido.Browser.Session
@@ -176,6 +177,64 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do
end
end
+ describe "WebFetch.run/2" do
+ test "passes provenance options through to the fetch API" do
+ expect(Jido.Browser, :web_fetch, fn "https://example.com/guide", opts ->
+ assert opts[:require_known_url] == true
+ assert "https://example.com/guide" in opts[:known_urls]
+ assert opts[:allowed_domains] == ["example.com"]
+
+ {:ok,
+ %{
+ url: "https://example.com/guide",
+ final_url: "https://example.com/guide",
+ title: "Guide",
+ content: "Fetched guide content",
+ format: :markdown,
+ content_type: "text/html",
+ document_type: :html,
+ retrieved_at: "2026-03-21T00:00:00Z",
+ estimated_tokens: 5,
+ original_estimated_tokens: 5,
+ truncated: false,
+ filtered: false,
+ focus_matches: 0,
+ cached: false,
+ citations: %{enabled: false},
+ passages: []
+ }}
+ end)
+
+ context = %{skill_state: %{seen_urls: ["https://example.com/guide"], web_fetch_uses: 0}}
+
+ assert {:ok, result} =
+ WebFetch.run(
+ %{
+ url: "https://example.com/guide",
+ require_known_url: true,
+ allowed_domains: ["example.com"]
+ },
+ context
+ )
+
+ assert result.status == "success"
+ assert result.url == "https://example.com/guide"
+ end
+
+ test "returns max_uses_exceeded before calling the fetch API" do
+ context = %{skill_state: %{web_fetch_uses: 2}}
+
+ assert {:error, error} =
+ WebFetch.run(
+ %{url: "https://example.com/guide", max_uses: 2},
+ context
+ )
+
+ assert %Jido.Browser.Error.InvalidError{} = error
+ assert error.details.error_code == :max_uses_exceeded
+ end
+ end
+
describe "Installer" do
test "target returns a supported platform atom" do
assert Installer.target() in [
diff --git a/test/jido_browser/plugin_test.exs b/test/jido_browser/plugin_test.exs
index 8636afd..ef16ef9 100644
--- a/test/jido_browser/plugin_test.exs
+++ b/test/jido_browser/plugin_test.exs
@@ -23,9 +23,9 @@ defmodule Jido.Browser.PluginTest do
assert "automation" in tags
end
- test "has 37 actions" do
+ test "has 38 actions" do
actions = Plugin.actions()
- assert length(actions) == 37
+ assert length(actions) == 38
end
test "includes all expected action modules" do
@@ -73,13 +73,14 @@ defmodule Jido.Browser.PluginTest do
# Advanced
assert Jido.Browser.Actions.Evaluate in actions
+ assert Jido.Browser.Actions.WebFetch in actions
end
end
describe "signal_routes/1" do
- test "returns 37 routes" do
+ test "returns 38 routes" do
routes = Plugin.signal_routes(%{})
- assert length(routes) == 37
+ assert length(routes) == 38
end
test "maps browser.navigate to Navigate action" do
@@ -122,6 +123,8 @@ defmodule Jido.Browser.PluginTest do
assert state.adapter == Jido.Browser.Adapters.AgentBrowser
assert state.last_url == nil
assert state.last_title == nil
+ assert state.seen_urls == []
+ assert state.web_fetch_uses == 0
end
test "accepts headless config override" do
@@ -154,7 +157,7 @@ defmodule Jido.Browser.PluginTest do
test "returns list of signal patterns" do
patterns = Plugin.signal_patterns()
assert is_list(patterns)
- assert length(patterns) == 37
+ assert length(patterns) == 38
end
test "all patterns have browser. prefix" do
@@ -173,6 +176,7 @@ defmodule Jido.Browser.PluginTest do
assert "browser.save_state" in patterns
assert "browser.tab_list" in patterns
assert "browser.console" in patterns
+ assert "browser.web_fetch" in patterns
end
end
@@ -188,6 +192,36 @@ defmodule Jido.Browser.PluginTest do
assert Plugin.transform_result(:some_action, result, %{}) == result
end
+ test "tracks discovered URLs and fetch usage for web fetch results" do
+ context = %{skill_state: %{seen_urls: ["https://seed.example"], web_fetch_uses: 1}}
+
+ result =
+ Plugin.transform_result(
+ Jido.Browser.Actions.WebFetch,
+ {:ok, %{url: "https://example.com", final_url: "https://example.com/final", status: "success"}},
+ context
+ )
+
+ assert {:ok, _result, state_updates} = result
+
+ assert Enum.sort(state_updates.seen_urls) ==
+ Enum.sort(["https://seed.example", "https://example.com", "https://example.com/final"])
+
+ assert state_updates.web_fetch_uses == 2
+ end
+
+ test "tracks URLs returned by search results" do
+ result =
+ Plugin.transform_result(
+ Jido.Browser.Actions.SearchWeb,
+ {:ok, %{results: [%{url: "https://elixir-lang.org"}]}},
+ %{skill_state: %{}}
+ )
+
+ assert {:ok, _result, state_updates} = result
+ assert state_updates.seen_urls == ["https://elixir-lang.org"]
+ end
+
test "enhances error results when session available" do
context = %{
skill_state: %{
diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs
new file mode 100644
index 0000000..c871bd0
--- /dev/null
+++ b/test/jido_browser/web_fetch_test.exs
@@ -0,0 +1,166 @@
+defmodule Jido.Browser.WebFetchTest do
+ use ExUnit.Case, async: false
+ use Mimic
+
+ alias Jido.Browser.Error
+ alias Jido.Browser.WebFetch
+
+ setup :set_mimic_global
+
+ setup_all do
+ Mimic.copy(Req)
+ :ok
+ end
+
+ setup do
+ WebFetch.clear_cache()
+ :ok
+ end
+
+ describe "web_fetch/2" do
+ test "fetches HTML content with selector extraction and citation passages" do
+ expect(Req, :run, fn opts ->
+ assert opts[:url] == "https://example.com/article"
+
+ request = Req.Request.new(url: "https://example.com/article")
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["text/html; charset=utf-8"]},
+ body: """
+              <html>
+                <head>
+                  <title>Example Article</title>
+                </head>
+                <body>
+                  <main>
+                    <h1>Hello</h1>
+                    <p>Alpha paragraph.</p>
+                    <p>Beta paragraph.</p>
+                  </main>
+                </body>
+              </html>
+              """
+ }
+
+ {request, response}
+ end)
+
+ assert {:ok, result} =
+ Jido.Browser.web_fetch(
+ "https://example.com/article",
+ selector: "main",
+ format: :markdown,
+ citations: true
+ )
+
+ assert result.title == "Example Article"
+ assert result.document_type == :html
+ assert result.format == :markdown
+ assert result.content =~ "Hello"
+ assert result.content =~ "Alpha paragraph."
+ assert result.cached == false
+ assert result.citations.enabled == true
+ assert [%{start_char: 0, text: passage_text} | _] = result.passages
+ assert passage_text =~ "Hello"
+ end
+
+ test "applies focused filtering to plain text responses" do
+ expect(Req, :run, fn opts ->
+ request = Req.Request.new(url: opts[:url])
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["text/plain"]},
+ body: """
+ Intro section
+
+ The relevant paragraph mentions Elixir and OTP.
+
+ Closing section
+ """
+ }
+
+ {request, response}
+ end)
+
+ assert {:ok, result} =
+ Jido.Browser.web_fetch(
+ "https://example.com/notes.txt",
+ format: :text,
+ focus_terms: ["elixir"]
+ )
+
+ assert result.filtered == true
+ assert result.focus_matches == 1
+ assert result.content =~ "relevant paragraph"
+ refute result.content =~ "Intro section"
+ end
+
+ test "rejects URLs outside allowed_domains" do
+ assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
+ Jido.Browser.web_fetch(
+ "https://example.com/private",
+ allowed_domains: ["docs.example.com"]
+ )
+ end
+
+ test "enforces known URL provenance when requested" do
+ assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
+ Jido.Browser.web_fetch(
+ "https://example.com/private",
+ require_known_url: true,
+ known_urls: ["https://example.com/public"]
+ )
+ end
+
+ test "caps returned content by approximate token budget" do
+ expect(Req, :run, fn opts ->
+ request = Req.Request.new(url: opts[:url])
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["text/plain"]},
+ body: String.duplicate("abcdef", 20)
+ }
+
+ {request, response}
+ end)
+
+ assert {:ok, result} =
+ Jido.Browser.web_fetch(
+ "https://example.com/large.txt",
+ format: :text,
+ max_content_tokens: 5
+ )
+
+ assert result.truncated == true
+ assert result.original_estimated_tokens > 5
+ assert result.estimated_tokens <= 5
+ end
+
+ test "reuses cached responses for identical requests" do
+ expect(Req, :run, fn opts ->
+ request = Req.Request.new(url: opts[:url])
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["text/plain"]},
+ body: "cached content"
+ }
+
+ {request, response}
+ end)
+
+ assert {:ok, first} = Jido.Browser.web_fetch("https://example.com/cache.txt", format: :text)
+ assert {:ok, second} = Jido.Browser.web_fetch("https://example.com/cache.txt", format: :text)
+
+ assert first.cached == false
+ assert second.cached == true
+ assert first.content == second.content
+ end
+ end
+end
From 48c8632cab7000f0f080538da2553fce86d6ce3f Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:00:41 -0500
Subject: [PATCH 2/7] chore: remove changelog entry from web fetch PR
---
CHANGELOG.md | 4 ----
1 file changed, 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 99820c3..115277a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,10 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
-### Added
-
-- Add HTTP-first `Jido.Browser.web_fetch/2` and `Jido.Browser.Actions.WebFetch` for stateless page retrieval with domain policy, focused filtering, caching, and citation-ready passages
-
### Changed
- Rename the public Elixir namespace from `JidoBrowser.*` to `Jido.Browser.*`
From f57bdd25538d4f0e2cbcfbd3db8d22284606d5b0 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:01:51 -0500
Subject: [PATCH 3/7] chore: drop changelog newline diff
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 115277a..fb19c91 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -110,4 +110,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Refactoring:
-* streamline agent-browser runtime defaults by mikehostetler
+* streamline agent-browser runtime defaults by mikehostetler
\ No newline at end of file
From 5c674e69e2e543ea0ee2067c695a2e3badbd00ca Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:15:02 -0500
Subject: [PATCH 4/7] feat: use extractous for web fetch documents
---
README.md | 7 +-
lib/jido_browser/actions/web_fetch.ex | 8 +-
lib/jido_browser/web_fetch.ex | 378 +++++++++++++++++++-------
mix.exs | 1 +
mix.lock | 4 +
test/jido_browser/web_fetch_test.exs | 97 +++++++
6 files changed, 387 insertions(+), 108 deletions(-)
diff --git a/README.md b/README.md
index c1d1808..590898a 100644
--- a/README.md
+++ b/README.md
@@ -94,6 +94,8 @@ result.content
result.passages
```
+`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats.
+
### State Persistence
```elixir
@@ -164,7 +166,10 @@ Optional web fetch settings:
```elixir
config :jido_browser, :web_fetch,
cache_ttl_ms: 300_000,
- pdftotext_path: "/usr/local/bin/pdftotext"
+ extractous: [
+ pdf: [extract_annotation_text: true],
+ office: [include_headers_and_footers: true]
+ ]
```
## Backends
diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex
index 417f07d..7efcaae 100644
--- a/lib/jido_browser/actions/web_fetch.ex
+++ b/lib/jido_browser/actions/web_fetch.ex
@@ -1,17 +1,17 @@
defmodule Jido.Browser.Actions.WebFetch do
@moduledoc """
- Stateless HTTP-first page retrieval for agent workflows.
+ Stateless HTTP-first document retrieval for agent workflows.
`WebFetch` is a lighter-weight alternative to browser navigation when the
target content can be retrieved over plain HTTP(S) without JavaScript
- execution.
+ execution, including fetched PDFs and office-style documents.
"""
use Jido.Action,
name: "web_fetch",
description:
- "Fetch a URL over HTTP(S) with domain policy controls, optional focused filtering, " <>
- "approximate token caps, and citation-ready passages.",
+ "Fetch a URL over HTTP(S) with domain policy controls, Extractous-backed document extraction, " <>
+ "optional focused filtering, approximate token caps, and citation-ready passages.",
category: "Browser",
tags: ["browser", "web", "fetch", "http", "retrieval"],
vsn: "2.0.0",
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
index 8599598..d94b51d 100644
--- a/lib/jido_browser/web_fetch.ex
+++ b/lib/jido_browser/web_fetch.ex
@@ -1,7 +1,8 @@
defmodule Jido.Browser.WebFetch do
@moduledoc """
Stateless HTTP-first web retrieval with optional domain policy, caching,
- focused filtering, and citation-ready passage metadata.
+ focused filtering, citation-ready passage metadata, and Extractous-backed
+ document extraction.
This module is intended for document retrieval workloads where starting a full
browser session would be unnecessary or too expensive.
@@ -16,8 +17,64 @@ defmodule Jido.Browser.WebFetch do
@default_max_url_length 2_048
@supported_formats [:markdown, :text, :html]
@html_content_types ["text/html", "application/xhtml+xml"]
- @text_content_types ["text/plain", "text/markdown", "text/csv", "text/xml", "application/xml"]
- @pdf_content_types ["application/pdf"]
+ @text_content_types [
+ "text/plain",
+ "text/markdown",
+ "text/csv",
+ "text/xml",
+ "application/xml",
+ "application/json",
+ "application/ld+json"
+ ]
+ @document_content_types %{
+ "application/pdf" => :pdf,
+ "application/msword" => :word_processing,
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => :word_processing,
+ "application/vnd.ms-word.document.macroenabled.12" => :word_processing,
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template" => :word_processing,
+ "application/vnd.ms-word.template.macroenabled.12" => :word_processing,
+ "application/vnd.ms-excel" => :spreadsheet,
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => :spreadsheet,
+ "application/vnd.ms-excel.sheet.macroenabled.12" => :spreadsheet,
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.template" => :spreadsheet,
+ "application/vnd.ms-excel.template.macroenabled.12" => :spreadsheet,
+ "application/vnd.ms-powerpoint" => :presentation,
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation" => :presentation,
+ "application/vnd.ms-powerpoint.presentation.macroenabled.12" => :presentation,
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow" => :presentation,
+ "application/vnd.openxmlformats-officedocument.presentationml.template" => :presentation,
+ "application/vnd.oasis.opendocument.text" => :word_processing,
+ "application/vnd.oasis.opendocument.spreadsheet" => :spreadsheet,
+ "application/vnd.oasis.opendocument.presentation" => :presentation,
+ "application/rtf" => :word_processing,
+ "text/rtf" => :word_processing,
+ "application/epub+zip" => :ebook,
+ "message/rfc822" => :email,
+ "application/vnd.ms-outlook" => :email
+ }
+ @document_extensions %{
+ "pdf" => :pdf,
+ "doc" => :word_processing,
+ "docx" => :word_processing,
+ "docm" => :word_processing,
+ "dotx" => :word_processing,
+ "dotm" => :word_processing,
+ "odt" => :word_processing,
+ "rtf" => :word_processing,
+ "xls" => :spreadsheet,
+ "xlsx" => :spreadsheet,
+ "xlsm" => :spreadsheet,
+ "xlsb" => :spreadsheet,
+ "ods" => :spreadsheet,
+ "ppt" => :presentation,
+ "pptx" => :presentation,
+ "pptm" => :presentation,
+ "ppsx" => :presentation,
+ "odp" => :presentation,
+ "epub" => :ebook,
+ "eml" => :email,
+ "msg" => :email
+ }
@type result :: %{
required(:url) => String.t(),
@@ -35,7 +92,8 @@ defmodule Jido.Browser.WebFetch do
required(:cached) => boolean(),
required(:citations) => %{enabled: boolean()},
required(:passages) => list(map()),
- optional(:title) => String.t() | nil
+ optional(:title) => String.t() | nil,
+ optional(:metadata) => map()
}
@doc """
@@ -53,6 +111,7 @@ defmodule Jido.Browser.WebFetch do
- `:cache` - enable ETS cache, defaults to `true`
- `:cache_ttl_ms` - cache TTL in milliseconds
- `:require_known_url` / `:known_urls` - optional URL provenance guard
+ - `:extractous` - optional `ExtractousEx` keyword options merged with config
"""
@spec fetch(String.t(), keyword()) :: {:ok, result()} | {:error, Exception.t()}
def fetch(url, opts \\ [])
@@ -125,13 +184,14 @@ defmodule Jido.Browser.WebFetch do
defp build_result(url, final_url, response, opts) do
content_type = response_content_type(response)
+ document_type = extractable_document_type(content_type, final_url, response.body)
cond do
content_type in @html_content_types ->
build_html_result(url, final_url, response.body, content_type, opts)
- content_type in @pdf_content_types ->
- build_pdf_result(url, final_url, response.body, content_type, opts)
+ not is_nil(document_type) ->
+ build_document_result(url, final_url, response.body, content_type, document_type, opts)
text_content_type?(content_type) ->
build_text_result(url, final_url, response.body, content_type, opts)
@@ -221,11 +281,11 @@ defmodule Jido.Browser.WebFetch do
})}
end
- defp build_pdf_result(url, final_url, body, content_type, opts) when is_binary(body) do
+ defp build_document_result(url, final_url, body, content_type, document_type, opts) when is_binary(body) do
cond do
opts[:selector] ->
{:error,
- Error.invalid_error("Selector filtering is not supported for PDF content", %{
+ Error.invalid_error("Selector filtering is only supported for HTML content", %{
error_code: :invalid_input,
selector: opts[:selector],
content_type: content_type
@@ -233,14 +293,14 @@ defmodule Jido.Browser.WebFetch do
opts[:format] == :html ->
{:error,
- Error.invalid_error("HTML output is not supported for PDF content", %{
+ Error.invalid_error("HTML output is only supported for HTML content", %{
error_code: :invalid_input,
format: :html,
content_type: content_type
})}
true ->
- with {:ok, text} <- extract_pdf_text(body),
+ with {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts),
{:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts),
{final_content, truncated, original_estimated_tokens} <-
maybe_truncate(filtered_content, opts[:max_content_tokens]) do
@@ -249,22 +309,23 @@ defmodule Jido.Browser.WebFetch do
url,
final_url,
final_content,
- title_from_url(final_url),
+ document_title(metadata, final_url),
content_type,
- :pdf,
+ document_type,
opts,
truncated,
filtered,
focus_matches,
- original_estimated_tokens
+ original_estimated_tokens,
+ metadata
)}
end
end
end
- defp build_pdf_result(_url, _final_url, body, content_type, _opts) do
+ defp build_document_result(_url, _final_url, body, content_type, _document_type, _opts) do
{:error,
- Error.adapter_error("Unexpected response body for PDF fetch", %{
+ Error.adapter_error("Unexpected response body for document fetch", %{
error_code: :unavailable,
content_type: content_type,
body: body
@@ -282,7 +343,8 @@ defmodule Jido.Browser.WebFetch do
truncated,
filtered,
focus_matches,
- original_estimated_tokens
+ original_estimated_tokens,
+ metadata \\ nil
) do
passages = maybe_build_passages(content, title, final_url, opts[:citations])
@@ -304,6 +366,7 @@ defmodule Jido.Browser.WebFetch do
citations: %{enabled: opts[:citations]},
passages: passages
}
+ |> maybe_put_metadata(metadata)
end
defp normalize_opts(opts) do
@@ -311,42 +374,46 @@ defmodule Jido.Browser.WebFetch do
citations = normalize_citations(opts[:citations])
focus_terms = normalize_focus_terms(opts[:focus_terms])
- cond do
- format not in @supported_formats ->
- {:error,
- Error.invalid_error("Unsupported web fetch format", %{
- error_code: :invalid_input,
- format: format,
- supported_formats: @supported_formats
- })}
+ with {:ok, configured_extractous_opts} <- normalize_extractous_opts(config(:extractous, [])),
+ {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])) do
+ cond do
+ format not in @supported_formats ->
+ {:error,
+ Error.invalid_error("Unsupported web fetch format", %{
+ error_code: :invalid_input,
+ format: format,
+ supported_formats: @supported_formats
+ })}
- present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) ->
- {:error,
- Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{
- error_code: :invalid_input
- })}
+ present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) ->
+ {:error,
+ Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{
+ error_code: :invalid_input
+ })}
- format == :html and focus_terms != [] ->
- {:error,
- Error.invalid_error("Focused filtering is only supported for markdown and text output", %{
- error_code: :invalid_input,
- format: format
- })}
+ format == :html and focus_terms != [] ->
+ {:error,
+ Error.invalid_error("Focused filtering is only supported for markdown and text output", %{
+ error_code: :invalid_input,
+ format: format
+ })}
- true ->
- normalized =
- opts
- |> Keyword.put(:format, format)
- |> Keyword.put(:citations, citations)
- |> Keyword.put(:focus_terms, focus_terms)
- |> Keyword.put_new(:focus_window, 0)
- |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
- |> Keyword.put_new(:max_redirects, @default_max_redirects)
- |> Keyword.put_new(:cache, true)
- |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
- |> Keyword.put_new(:known_urls, [])
-
- {:ok, normalized}
+ true ->
+ normalized =
+ opts
+ |> Keyword.put(:format, format)
+ |> Keyword.put(:citations, citations)
+ |> Keyword.put(:focus_terms, focus_terms)
+ |> Keyword.put(:extractous, merge_extractous_opts(configured_extractous_opts, request_extractous_opts))
+ |> Keyword.put_new(:focus_window, 0)
+ |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
+ |> Keyword.put_new(:max_redirects, @default_max_redirects)
+ |> Keyword.put_new(:cache, true)
+ |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
+ |> Keyword.put_new(:known_urls, [])
+
+ {:ok, normalized}
+ end
end
end
@@ -694,44 +761,44 @@ defmodule Jido.Browser.WebFetch do
end
end
- defp extract_pdf_text(bytes) do
- case pdftotext_path() do
- nil ->
+ defp extract_document_content(bytes, final_url, content_type, document_type, opts) do
+ case ExtractousEx.extract_from_bytes(bytes, opts[:extractous]) do
+ {:ok, %{content: content, metadata: metadata}} when is_binary(content) ->
+ {:ok, String.trim(content), normalize_metadata(metadata)}
+
+ {:ok, %{content: content}} when is_binary(content) ->
+ {:ok, String.trim(content), %{}}
+
+ {:ok, result} ->
{:error,
- Error.adapter_error("PDF extraction requires pdftotext to be installed", %{
- error_code: :unsupported_content_type,
- content_type: "application/pdf"
+ Error.adapter_error("ExtractousEx returned an unexpected document payload", %{
+ error_code: :unavailable,
+ url: final_url,
+ content_type: content_type,
+ document_type: document_type,
+ result: result
})}
- binary ->
- with_tmp_files("jido_browser_web_fetch", ".pdf", ".txt", fn pdf_path, txt_path ->
- File.write!(pdf_path, bytes)
-
- case System.cmd(binary, ["-layout", "-nopgbrk", pdf_path, txt_path], stderr_to_stdout: true) do
- {_output, 0} ->
- case File.read(txt_path) do
- {:ok, text} ->
- {:ok, String.trim(text)}
-
- {:error, reason} ->
- {:error,
- Error.adapter_error("Failed to read extracted PDF text", %{error_code: :unavailable, reason: reason})}
- end
-
- {output, status} ->
- {:error,
- Error.adapter_error("pdftotext failed while extracting PDF", %{
- error_code: :unavailable,
- status: status,
- output: output
- })}
- end
- end)
+ {:error, reason} ->
+ {:error,
+ Error.adapter_error("ExtractousEx failed while extracting document content", %{
+ error_code: :unavailable,
+ url: final_url,
+ content_type: content_type,
+ document_type: document_type,
+ reason: reason
+ })}
end
- end
-
- defp pdftotext_path do
- config(:pdftotext_path) || System.find_executable("pdftotext")
+ rescue
+ error ->
+ {:error,
+ Error.adapter_error("ExtractousEx failed while extracting document content", %{
+ error_code: :unavailable,
+ url: final_url,
+ content_type: content_type,
+ document_type: document_type,
+ reason: error
+ })}
end
defp fetch_cached(url, opts) do
@@ -783,12 +850,16 @@ defmodule Jido.Browser.WebFetch do
defp cache_key(url, opts) do
{:jido_browser_web_fetch, url, opts[:format], opts[:selector], opts[:allowed_domains], opts[:blocked_domains],
- opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations]}
+ opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations], opts[:extractous]}
end
defp request_headers do
[
- {"accept", "text/html,application/xhtml+xml,text/plain,application/pdf;q=0.9,*/*;q=0.1"},
+ {"accept",
+ "text/html,application/xhtml+xml,text/plain,application/json,application/pdf," <>
+ "application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document," <>
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet," <>
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation,*/*;q=0.1"},
{"user-agent", user_agent()}
]
end
@@ -814,10 +885,15 @@ defmodule Jido.Browser.WebFetch do
end
defp infer_content_type(body) when is_binary(body) do
- if String.starts_with?(body, "%PDF-") do
- "application/pdf"
- else
- "text/plain"
+ cond do
+ String.starts_with?(body, "%PDF-") ->
+ "application/pdf"
+
+ likely_text?(body) ->
+ "text/plain"
+
+ true ->
+ "application/octet-stream"
end
end
@@ -857,6 +933,38 @@ defmodule Jido.Browser.WebFetch do
|> Enum.uniq()
end
+ defp normalize_extractous_opts(nil), do: {:ok, []}
+
+ defp normalize_extractous_opts(opts) when is_list(opts) do
+ if Keyword.keyword?(opts) do
+ {:ok, opts}
+ else
+ {:error,
+ Error.invalid_error("Extractous options must be a keyword list", %{
+ error_code: :invalid_input,
+ extractous: opts
+ })}
+ end
+ end
+
+ defp normalize_extractous_opts(opts) do
+ {:error,
+ Error.invalid_error("Extractous options must be a keyword list", %{
+ error_code: :invalid_input,
+ extractous: opts
+ })}
+ end
+
+ defp merge_extractous_opts(left, right) do
+ Keyword.merge(left, right, fn _key, left_value, right_value ->
+ if Keyword.keyword?(left_value) and Keyword.keyword?(right_value) do
+ merge_extractous_opts(left_value, right_value)
+ else
+ right_value
+ end
+ end)
+ end
+
defp normalize_known_url(url) when is_binary(url) do
url
|> String.trim()
@@ -876,6 +984,68 @@ defmodule Jido.Browser.WebFetch do
defp normalize_rule_path(""), do: "/"
defp normalize_rule_path(path), do: if(String.starts_with?(path, "/"), do: path, else: "/" <> path)
+ defp extractable_document_type(content_type, final_url, body) do
+ Map.get(@document_content_types, content_type) ||
+ infer_document_type_from_body(body) ||
+ if(ambiguous_binary_content_type?(content_type), do: infer_document_type_from_url(final_url), else: nil)
+ end
+
+ defp infer_document_type_from_url(url) do
+ url
+ |> URI.parse()
+ |> Map.get(:path, "")
+ |> Path.extname()
+ |> String.trim_leading(".")
+ |> String.downcase()
+ |> case do
+ "" -> nil
+ extension -> Map.get(@document_extensions, extension)
+ end
+ end
+
+ defp infer_document_type_from_body(body) when is_binary(body) do
+ if String.starts_with?(body, "%PDF-"), do: :pdf, else: nil
+ end
+
+ defp infer_document_type_from_body(_body), do: nil
+
+ defp document_title(metadata, url) do
+ metadata
+ |> metadata_title()
+ |> blank_to_nil()
+ |> case do
+ nil -> title_from_url(url)
+ title -> title
+ end
+ end
+
+ defp metadata_title(metadata) when is_map(metadata) do
+ Enum.find_value([:title, "title", "dc:title", :"dc:title"], fn key ->
+ metadata
+ |> Map.get(key)
+ |> metadata_value_to_string()
+ |> blank_to_nil()
+ end)
+ end
+
+ defp metadata_title(_metadata), do: nil
+
+ defp metadata_value_to_string(nil), do: nil
+ defp metadata_value_to_string(value) when is_binary(value), do: String.trim(value)
+
+ defp metadata_value_to_string(value) when is_list(value),
+ do: value |> Enum.map_join(" ", &to_string/1) |> String.trim()
+
+ defp metadata_value_to_string(value) when is_atom(value), do: value |> Atom.to_string() |> String.trim()
+ defp metadata_value_to_string(value) when is_number(value), do: value |> to_string() |> String.trim()
+ defp metadata_value_to_string(_value), do: nil
+
+ defp normalize_metadata(metadata) when is_map(metadata), do: metadata
+ defp normalize_metadata(_metadata), do: %{}
+
+ defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response
+ defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata)
+
defp title_from_url(url) do
path = URI.parse(url).path || ""
@@ -894,22 +1064,24 @@ defmodule Jido.Browser.WebFetch do
String.printable?(value) and String.match?(value, ~r/^[\x00-\x7F]+$/)
end
- defp config(key, default \\ nil) do
- :jido_browser
- |> Application.get_env(:web_fetch, [])
- |> Keyword.get(key, default)
+ defp ambiguous_binary_content_type?(content_type) do
+ content_type in [
+ "application/octet-stream",
+ "binary/octet-stream",
+ "application/download",
+ "application/x-download",
+ "application/zip",
+ "application/x-zip-compressed"
+ ]
end
- defp with_tmp_files(prefix, first_suffix, second_suffix, fun) do
- base = Path.join(System.tmp_dir!(), "#{prefix}_#{System.unique_integer([:positive])}")
- first = base <> first_suffix
- second = base <> second_suffix
+ defp likely_text?(body) when is_binary(body) do
+ String.valid?(body) and not String.contains?(body, <<0>>)
+ end
- try do
- fun.(first, second)
- after
- File.rm(first)
- File.rm(second)
- end
+ defp config(key, default) do
+ :jido_browser
+ |> Application.get_env(:web_fetch, [])
+ |> Keyword.get(key, default)
end
end
diff --git a/mix.exs b/mix.exs
index 091d21a..9fad805 100644
--- a/mix.exs
+++ b/mix.exs
@@ -71,6 +71,7 @@ defmodule Jido.Browser.MixProject do
{:uniq, "~> 0.6"},
{:floki, "~> 0.38"},
{:html2markdown, "~> 0.3"},
+ {:extractous_ex, "~> 0.2"},
# Dev/Test
{:credo, "~> 1.7", only: [:dev, :test], runtime: false},
diff --git a/mix.lock b/mix.lock
index aaf36a7..e91947c 100644
--- a/mix.lock
+++ b/mix.lock
@@ -1,6 +1,7 @@
%{
"abacus": {:hex, :abacus, "2.1.0", "b6db5c989ba3d9dd8c36d1cb269e2f0058f34768d47c67eb8ce06697ecb36dd4", [:mix], [], "hexpm", "255de08b02884e8383f1eed8aa31df884ce0fb5eb394db81ff888089f2a1bbff"},
"bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"},
+ "castore": {:hex, :castore, "1.0.18", "5e43ef0ec7d31195dfa5a65a86e6131db999d074179d2ba5a8de11fe14570f55", [:mix], [], "hexpm", "f393e4fe6317829b158fb74d86eb681f737d2fe326aa61ccf6293c4104957e34"},
"certifi": {:hex, :certifi, "2.15.0", "0e6e882fcdaaa0a5a9f2b3db55b1394dba07e8d6d9bcad08318fb604c6839712", [:rebar3], [], "hexpm", "b147ed22ce71d72eafdad94f055165c1c182f61a2ff49df28bcc71d1d5b94a60"},
"credo": {:hex, :credo, "1.7.17", "f92b6aa5b26301eaa5a35e4d48ebf5aa1e7094ac00ae38f87086c562caf8a22f", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "1eb5645c835f0b6c9b5410f94b5a185057bcf6d62a9c2b476da971cde8749645"},
"crontab": {:hex, :crontab, "1.2.0", "503611820257939d5d0fd272eb2b454f48a470435a809479ddc2c40bb515495c", [:mix], [{:ecto, "~> 1.0 or ~> 2.0 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "ebd7ef4d831e1b20fa4700f0de0284a04cac4347e813337978e25b4cc5cc2207"},
@@ -12,6 +13,7 @@
"erlex": {:hex, :erlex, "0.2.8", "cd8116f20f3c0afe376d1e8d1f0ae2452337729f68be016ea544a72f767d9c12", [:mix], [], "hexpm", "9d66ff9fedf69e49dc3fd12831e12a8a37b76f8651dd21cd45fcf5561a8a7590"},
"ex_doc": {:hex, :ex_doc, "0.40.1", "67542e4b6dde74811cfd580e2c0149b78010fd13001fda7cfeb2b2c2ffb1344d", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"},
"excoveralls": {:hex, :excoveralls, "0.18.5", "e229d0a65982613332ec30f07940038fe451a2e5b29bce2a5022165f0c9b157e", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "523fe8a15603f86d64852aab2abe8ddbd78e68579c8525ae765facc5eae01562"},
+ "extractous_ex": {:hex, :extractous_ex, "0.2.1", "c9f7fd58b1d3b0d7eda9e219b1ed534a5b25e485884405d3ceee878e67248df2", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:rustler, "~> 0.37", [hex: :rustler, repo: "hexpm", optional: false]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "8c1a3c74105448545a8478c3610fc920b2da418d47eae656853dc3e881adebd0"},
"file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"},
"finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"},
"floki": {:hex, :floki, "0.38.0", "62b642386fa3f2f90713f6e231da0fa3256e41ef1089f83b6ceac7a3fd3abf33", [:mix], [], "hexpm", "a5943ee91e93fb2d635b612caf5508e36d37548e84928463ef9dd986f0d1abd9"},
@@ -51,6 +53,8 @@
"private": {:hex, :private, "0.1.2", "da4add9f36c3818a9f849840ca43016c8ae7f76d7a46c3b2510f42dcc5632932", [:mix], [], "hexpm", "22ee01c3f450cf8d135da61e10ec59dde006238fab1ea039014791fc8f3ff075"},
"recase": {:hex, :recase, "0.8.1", "ab98cd35857a86fa5ca99036f575241d71d77d9c2ab0c39aacf1c9b61f6f7d1d", [:mix], [], "hexpm", "9fd8d63e7e43bd9ea385b12364e305778b2bbd92537e95c4b2e26fc507d5e4c2"},
"req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"},
+ "rustler": {:hex, :rustler, "0.37.3", "5f4e6634d43b26f0a69834dd1d3ed4e1710b022a053bf4a670220c9540c92602", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "a6872c6f53dcf00486d1e7f9e046e20e01bf1654bdacc4193016c2e8002b32a2"},
+ "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.4", "700a878312acfac79fb6c572bb8b57f5aae05fe1cf70d34b5974850bbf2c05bf", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "3b33d99b540b15f142ba47944f7a163a25069f6d608783c321029bc1ffb09514"},
"splode": {:hex, :splode, "0.3.0", "ff8effecc509a51245df2f864ec78d849248647c37a75886033e3b1a53ca9470", [:mix], [], "hexpm", "73cfd0892d7316d6f2c93e6e8784bd6e137b2aa38443de52fd0a25171d106d81"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
"telemetry": {:hex, :telemetry, "1.4.1", "ab6de178e2b29b58e8256b92b382ea3f590a47152ca3651ea857a6cae05ac423", [:rebar3], [], "hexpm", "2172e05a27531d3d31dd9782841065c50dd5c3c7699d95266b2edd54c2dafa1c"},
diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs
index c871bd0..43aad25 100644
--- a/test/jido_browser/web_fetch_test.exs
+++ b/test/jido_browser/web_fetch_test.exs
@@ -9,6 +9,7 @@ defmodule Jido.Browser.WebFetchTest do
setup_all do
Mimic.copy(Req)
+ Mimic.copy(ExtractousEx)
:ok
end
@@ -98,6 +99,102 @@ defmodule Jido.Browser.WebFetchTest do
refute result.content =~ "Intro section"
end
+ test "extracts PDF content through ExtractousEx and preserves metadata" do
+ pdf_bytes = "%PDF-1.7 fake"
+
+ expect(Req, :run, fn opts ->
+ request = Req.Request.new(url: opts[:url])
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["application/pdf"]},
+ body: pdf_bytes
+ }
+
+ {request, response}
+ end)
+
+ expect(ExtractousEx, :extract_from_bytes, fn ^pdf_bytes, opts ->
+ assert opts == []
+
+ {:ok,
+ %{
+ content: "Extracted PDF body",
+ metadata: %{"title" => "Quarterly Report", "author" => "Ops"}
+ }}
+ end)
+
+ assert {:ok, result} =
+ Jido.Browser.web_fetch(
+ "https://example.com/reports/q1.pdf",
+ format: :text,
+ citations: true
+ )
+
+ assert result.title == "Quarterly Report"
+ assert result.document_type == :pdf
+ assert result.content_type == "application/pdf"
+ assert result.content == "Extracted PDF body"
+ assert result.metadata == %{"title" => "Quarterly Report", "author" => "Ops"}
+ assert result.citations.enabled == true
+ assert [%{text: "Extracted PDF body"}] = result.passages
+ end
+
+ test "extracts office documents served as octet-stream based on file extension" do
+ docx_bytes = <<80, 75, 3, 4, 20, 0, 0, 0>>
+
+ expect(Req, :run, fn opts ->
+ request = Req.Request.new(url: opts[:url])
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["application/octet-stream"]},
+ body: docx_bytes
+ }
+
+ {request, response}
+ end)
+
+ expect(ExtractousEx, :extract_from_bytes, fn ^docx_bytes, opts ->
+ assert opts == []
+ {:ok, %{content: "DOCX body", metadata: %{}}}
+ end)
+
+ assert {:ok, result} =
+ Jido.Browser.web_fetch("https://example.com/specs/design.docx", format: :markdown)
+
+ assert result.title == "design.docx"
+ assert result.document_type == :word_processing
+ assert result.content_type == "application/octet-stream"
+ assert result.content == "DOCX body"
+ end
+
+ test "returns an adapter error when ExtractousEx extraction fails" do
+ pdf_bytes = "%PDF-1.7 broken"
+
+ expect(Req, :run, fn opts ->
+ request = Req.Request.new(url: opts[:url])
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["application/pdf"]},
+ body: pdf_bytes
+ }
+
+ {request, response}
+ end)
+
+ expect(ExtractousEx, :extract_from_bytes, fn ^pdf_bytes, [] ->
+ {:error, "parse failed"}
+ end)
+
+ assert {:error, %Error.AdapterError{details: %{error_code: :unavailable, document_type: :pdf}}} =
+ Jido.Browser.web_fetch("https://example.com/broken.pdf", format: :text)
+ end
+
test "rejects URLs outside allowed_domains" do
assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
Jido.Browser.web_fetch(
From 909aad680a0e5b0d63f159a0648ed2770f5b5de3 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:17:46 -0500
Subject: [PATCH 5/7] refactor: harden web fetch normalization
---
README.md | 5 +-
lib/jido_browser/web_fetch.ex | 242 +++++++++++++++++----------
test/jido_browser/web_fetch_test.exs | 30 ++++
3 files changed, 185 insertions(+), 92 deletions(-)
diff --git a/README.md b/README.md
index 590898a..f6f78bd 100644
--- a/README.md
+++ b/README.md
@@ -92,9 +92,10 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow:
result.content
result.passages
+result.metadata
```
-`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats.
+`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. Binary document responses may also include `result.metadata` when extraction returns document metadata.
### State Persistence
@@ -172,6 +173,8 @@ config :jido_browser, :web_fetch,
]
```
+Configured `extractous` options are merged with any per-call `extractous:` keyword options passed to `Jido.Browser.web_fetch/2`.
+
## Backends
### AgentBrowser (Default)
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
index d94b51d..5c5cf81 100644
--- a/lib/jido_browser/web_fetch.ex
+++ b/lib/jido_browser/web_fetch.ex
@@ -153,6 +153,7 @@ defmodule Jido.Browser.WebFetch do
url: url,
headers: request_headers(),
receive_timeout: opts[:timeout],
+ decode_body: false,
redirect: true,
max_redirects: opts[:max_redirects]
]
@@ -211,24 +212,8 @@ defmodule Jido.Browser.WebFetch do
with {:ok, document} <- parse_document(body),
{:ok, html} <- select_html(document, body, selector),
{:ok, title} <- extract_title(document),
- {:ok, content} <- format_html(html, opts[:format], opts),
- {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
- {final_content, truncated, original_estimated_tokens} <-
- maybe_truncate(filtered_content, opts[:max_content_tokens]) do
- {:ok,
- build_response(
- url,
- final_url,
- final_content,
- title,
- content_type,
- :html,
- opts,
- truncated,
- filtered,
- focus_matches,
- original_estimated_tokens
- )}
+ {:ok, content} <- format_html(html, opts[:format], opts) do
+ finalize_result(url, final_url, content, title, content_type, :html, opts)
end
end
@@ -242,33 +227,9 @@ defmodule Jido.Browser.WebFetch do
end
defp build_text_result(url, final_url, body, content_type, opts) when is_binary(body) do
- if opts[:selector] do
- {:error,
- Error.invalid_error("Selector filtering is only supported for HTML content", %{
- error_code: :invalid_input,
- selector: opts[:selector],
- content_type: content_type
- })}
- else
- with {:ok, content} <- format_text(body, opts[:format]),
- {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
- {final_content, truncated, original_estimated_tokens} <-
- maybe_truncate(filtered_content, opts[:max_content_tokens]) do
- {:ok,
- build_response(
- url,
- final_url,
- final_content,
- nil,
- content_type,
- :text,
- opts,
- truncated,
- filtered,
- focus_matches,
- original_estimated_tokens
- )}
- end
+ with :ok <- validate_non_html_options(content_type, opts),
+ {:ok, content} <- format_text(body, opts[:format]) do
+ finalize_result(url, final_url, content, nil, content_type, :text, opts)
end
end
@@ -282,44 +243,18 @@ defmodule Jido.Browser.WebFetch do
end
defp build_document_result(url, final_url, body, content_type, document_type, opts) when is_binary(body) do
- cond do
- opts[:selector] ->
- {:error,
- Error.invalid_error("Selector filtering is only supported for HTML content", %{
- error_code: :invalid_input,
- selector: opts[:selector],
- content_type: content_type
- })}
-
- opts[:format] == :html ->
- {:error,
- Error.invalid_error("HTML output is only supported for HTML content", %{
- error_code: :invalid_input,
- format: :html,
- content_type: content_type
- })}
-
- true ->
- with {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts),
- {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts),
- {final_content, truncated, original_estimated_tokens} <-
- maybe_truncate(filtered_content, opts[:max_content_tokens]) do
- {:ok,
- build_response(
- url,
- final_url,
- final_content,
- document_title(metadata, final_url),
- content_type,
- document_type,
- opts,
- truncated,
- filtered,
- focus_matches,
- original_estimated_tokens,
- metadata
- )}
- end
+ with :ok <- validate_non_html_options(content_type, opts),
+ {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts) do
+ finalize_result(
+ url,
+ final_url,
+ text,
+ document_title(metadata, final_url),
+ content_type,
+ document_type,
+ opts,
+ metadata
+ )
end
end
@@ -344,7 +279,7 @@ defmodule Jido.Browser.WebFetch do
filtered,
focus_matches,
original_estimated_tokens,
- metadata \\ nil
+ metadata
) do
passages = maybe_build_passages(content, title, final_url, opts[:citations])
@@ -369,13 +304,76 @@ defmodule Jido.Browser.WebFetch do
|> maybe_put_metadata(metadata)
end
+ defp finalize_result(url, final_url, content, title, content_type, document_type, opts, metadata \\ nil) do
+ with {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
+ {final_content, truncated, original_estimated_tokens} <-
+ maybe_truncate(filtered_content, opts[:max_content_tokens]) do
+ {:ok,
+ build_response(
+ url,
+ final_url,
+ final_content,
+ title,
+ content_type,
+ document_type,
+ opts,
+ truncated,
+ filtered,
+ focus_matches,
+ original_estimated_tokens,
+ metadata
+ )}
+ end
+ end
+
+ defp validate_non_html_options(content_type, opts) do
+ cond do
+ opts[:selector] ->
+ {:error,
+ Error.invalid_error("Selector filtering is only supported for HTML content", %{
+ error_code: :invalid_input,
+ selector: opts[:selector],
+ content_type: content_type
+ })}
+
+ opts[:format] == :html ->
+ {:error,
+ Error.invalid_error("HTML output is only supported for HTML content", %{
+ error_code: :invalid_input,
+ format: :html,
+ content_type: content_type
+ })}
+
+ true ->
+ :ok
+ end
+ end
+
defp normalize_opts(opts) do
format = opts[:format] || :markdown
citations = normalize_citations(opts[:citations])
focus_terms = normalize_focus_terms(opts[:focus_terms])
with {:ok, configured_extractous_opts} <- normalize_extractous_opts(config(:extractous, [])),
- {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])) do
+ {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])),
+ {:ok, selector} <- normalize_selector(opts[:selector]),
+ {:ok, focus_window} <- normalize_integer_opt(:focus_window, Keyword.get(opts, :focus_window, 0), min: 0),
+ {:ok, timeout} <-
+ normalize_integer_opt(:timeout, Keyword.get(opts, :timeout, config(:timeout, @default_timeout)), min: 1),
+ {:ok, max_redirects} <-
+ normalize_integer_opt(:max_redirects, Keyword.get(opts, :max_redirects, @default_max_redirects), min: 0),
+ {:ok, cache_ttl_ms} <-
+ normalize_integer_opt(
+ :cache_ttl_ms,
+ Keyword.get(opts, :cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)),
+ min: 0
+ ),
+ {:ok, max_content_tokens} <-
+ normalize_optional_integer_opt(:max_content_tokens, opts[:max_content_tokens], min: 1),
+ {:ok, max_url_length} <- normalize_optional_integer_opt(:max_url_length, opts[:max_url_length], min: 1),
+ {:ok, cache} <- normalize_boolean_opt(:cache, Keyword.get(opts, :cache, true)),
+ {:ok, require_known_url} <-
+ normalize_boolean_opt(:require_known_url, Keyword.get(opts, :require_known_url, false)) do
cond do
format not in @supported_formats ->
{:error,
@@ -402,14 +400,18 @@ defmodule Jido.Browser.WebFetch do
normalized =
opts
|> Keyword.put(:format, format)
+ |> Keyword.put(:selector, selector)
|> Keyword.put(:citations, citations)
|> Keyword.put(:focus_terms, focus_terms)
+ |> Keyword.put(:focus_window, focus_window)
+ |> Keyword.put(:timeout, timeout)
+ |> Keyword.put(:max_redirects, max_redirects)
+ |> Keyword.put(:cache, cache)
+ |> Keyword.put(:cache_ttl_ms, cache_ttl_ms)
+ |> Keyword.put(:require_known_url, require_known_url)
|> Keyword.put(:extractous, merge_extractous_opts(configured_extractous_opts, request_extractous_opts))
- |> Keyword.put_new(:focus_window, 0)
- |> Keyword.put_new(:timeout, config(:timeout, @default_timeout))
- |> Keyword.put_new(:max_redirects, @default_max_redirects)
- |> Keyword.put_new(:cache, true)
- |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms))
+ |> maybe_put(:max_content_tokens, max_content_tokens)
+ |> maybe_put(:max_url_length, max_url_length)
|> Keyword.put_new(:known_urls, [])
{:ok, normalized}
@@ -937,7 +939,7 @@ defmodule Jido.Browser.WebFetch do
defp normalize_extractous_opts(opts) when is_list(opts) do
if Keyword.keyword?(opts) do
- {:ok, opts}
+ {:ok, canonicalize_keyword_list(opts)}
else
{:error,
Error.invalid_error("Extractous options must be a keyword list", %{
@@ -955,6 +957,62 @@ defmodule Jido.Browser.WebFetch do
})}
end
+ defp normalize_selector(nil), do: {:ok, nil}
+
+ defp normalize_selector(selector) when is_binary(selector) do
+ selector
+ |> String.trim()
+ |> case do
+ "" -> {:ok, nil}
+ value -> {:ok, value}
+ end
+ end
+
+ defp normalize_selector(selector) do
+ {:error,
+ Error.invalid_error("Selector must be a string", %{
+ error_code: :invalid_input,
+ selector: selector
+ })}
+ end
+
+ defp normalize_integer_opt(_name, value, min: min) when is_integer(value) and value >= min, do: {:ok, value}
+
+ defp normalize_integer_opt(name, value, min: min) do
+ {:error,
+ Error.invalid_error("#{name} must be an integer greater than or equal to #{min}", %{
+ error_code: :invalid_input,
+ option: name,
+ value: value
+ })}
+ end
+
+ defp normalize_optional_integer_opt(_name, nil, _opts), do: {:ok, nil}
+ defp normalize_optional_integer_opt(name, value, opts), do: normalize_integer_opt(name, value, opts)
+
+ defp normalize_boolean_opt(_name, value) when is_boolean(value), do: {:ok, value}
+
+ defp normalize_boolean_opt(name, value) do
+ {:error,
+ Error.invalid_error("#{name} must be a boolean", %{
+ error_code: :invalid_input,
+ option: name,
+ value: value
+ })}
+ end
+
+ defp canonicalize_keyword_list(keyword_list) do
+ keyword_list
+ |> Enum.map(fn {key, value} = pair ->
+ if is_list(value) and Keyword.keyword?(value) do
+ {key, canonicalize_keyword_list(value)}
+ else
+ pair
+ end
+ end)
+ |> Enum.sort_by(fn {key, _value} -> to_string(key) end)
+ end
+
defp merge_extractous_opts(left, right) do
Keyword.merge(left, right, fn _key, left_value, right_value ->
if Keyword.keyword?(left_value) and Keyword.keyword?(right_value) do
@@ -1045,6 +1103,8 @@ defmodule Jido.Browser.WebFetch do
defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response
defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata)
+ defp maybe_put(opts, _key, nil), do: opts
+ defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value)
defp title_from_url(url) do
path = URI.parse(url).path || ""
diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs
index 43aad25..931ada8 100644
--- a/test/jido_browser/web_fetch_test.exs
+++ b/test/jido_browser/web_fetch_test.exs
@@ -22,6 +22,7 @@ defmodule Jido.Browser.WebFetchTest do
test "fetches HTML content with selector extraction and citation passages" do
expect(Req, :run, fn opts ->
assert opts[:url] == "https://example.com/article"
+ assert opts[:decode_body] == false
request = Req.Request.new(url: "https://example.com/article")
@@ -66,6 +67,27 @@ defmodule Jido.Browser.WebFetchTest do
assert passage_text =~ "Hello"
end
+ test "preserves JSON responses as text content" do
+ expect(Req, :run, fn opts ->
+ assert opts[:decode_body] == false
+ request = Req.Request.new(url: opts[:url])
+
+ response =
+ %Req.Response{
+ status: 200,
+ headers: %{"content-type" => ["application/json"]},
+ body: ~s({"name":"jido","kind":"agent"})
+ }
+
+ {request, response}
+ end)
+
+ assert {:ok, result} = Jido.Browser.web_fetch("https://example.com/data.json", format: :text)
+
+ assert result.document_type == :text
+ assert result.content =~ ~s("name":"jido")
+ end
+
test "applies focused filtering to plain text responses" do
expect(Req, :run, fn opts ->
request = Req.Request.new(url: opts[:url])
@@ -203,6 +225,14 @@ defmodule Jido.Browser.WebFetchTest do
)
end
+ test "rejects invalid direct API options early" do
+ assert {:error, %Error.InvalidError{details: %{option: :timeout, error_code: :invalid_input}}} =
+ Jido.Browser.web_fetch("https://example.com/notes.txt", timeout: 0)
+
+ assert {:error, %Error.InvalidError{details: %{extractous: [:bad, :shape], error_code: :invalid_input}}} =
+ Jido.Browser.web_fetch("https://example.com/notes.txt", extractous: [:bad, :shape])
+ end
+
test "enforces known URL provenance when requested" do
assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} =
Jido.Browser.web_fetch(
From 002dba516f34b3ce82ad6d4c00d6624a7deba901 Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:25:08 -0500
Subject: [PATCH 6/7] docs: clarify web fetch API
---
README.md | 2 +-
lib/jido_browser.ex | 8 +++++++-
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index f6f78bd..81940c0 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow:
result.content
result.passages
-result.metadata
+result.metadata # present when extraction returns document metadata
```
`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. Binary document responses may also include `result.metadata` when extraction returns document metadata.
diff --git a/lib/jido_browser.ex b/lib/jido_browser.ex
index 91139ac..39a4424 100644
--- a/lib/jido_browser.ex
+++ b/lib/jido_browser.ex
@@ -109,7 +109,13 @@ defmodule Jido.Browser do
end
end
- @doc "Fetches a URL over HTTP(S) without starting a browser session."
+ @doc """
+ Fetches a URL over HTTP(S) without starting a browser session.
+
+ HTML responses keep native selector extraction and format conversion, while
+ fetched binary documents such as PDFs and office files are extracted through
+ `ExtractousEx`.
+ """
@spec web_fetch(String.t(), keyword()) :: {:ok, map()} | {:error, term()}
def web_fetch(url, opts \\ [])
From 1d495fd9436b87c7f75d87bf98131dd3a5ce621d Mon Sep 17 00:00:00 2001
From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:39:02 -0500
Subject: [PATCH 7/7] fix: resolve lint regressions
---
lib/jido_browser/actions/web_fetch.ex | 5 +-
lib/jido_browser/plugin.ex | 17 +-
lib/jido_browser/web_fetch.ex | 277 +++++++++++++-------------
3 files changed, 142 insertions(+), 157 deletions(-)
diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex
index 7efcaae..fa79361 100644
--- a/lib/jido_browser/actions/web_fetch.ex
+++ b/lib/jido_browser/actions/web_fetch.ex
@@ -40,11 +40,8 @@ defmodule Jido.Browser.Actions.WebFetch do
{:ok, result} <- Jido.Browser.web_fetch(params.url, build_opts(params, context)) do
{:ok, Map.put(result, :status, "success")}
else
- {:error, %_{} = error} ->
+ {:error, error} ->
{:error, error}
-
- {:error, reason} ->
- {:error, Error.adapter_error("Web fetch failed", %{reason: reason})}
end
end
diff --git a/lib/jido_browser/plugin.ex b/lib/jido_browser/plugin.ex
index 2a58bba..a99f841 100644
--- a/lib/jido_browser/plugin.ex
+++ b/lib/jido_browser/plugin.ex
@@ -285,7 +285,7 @@ defmodule Jido.Browser.Plugin do
seen_urls =
current_seen_urls
|> Kernel.++(extract_urls(result))
- |> Enum.reject(&is_nil_or_empty/1)
+ |> Enum.reject(&nil_or_empty?/1)
|> Enum.uniq()
if seen_urls == [] or seen_urls == current_seen_urls do
@@ -305,26 +305,23 @@ defmodule Jido.Browser.Plugin do
defp extract_urls(result) do
direct_urls =
[Map.get(result, :url), Map.get(result, "url"), Map.get(result, :final_url), Map.get(result, "final_url")]
- |> Enum.reject(&is_nil_or_empty/1)
+ |> Enum.reject(&nil_or_empty?/1)
search_urls =
result
|> Map.get(:results, Map.get(result, "results", []))
|> List.wrap()
|> Enum.map(fn item ->
- cond do
- is_map(item) -> Map.get(item, :url) || Map.get(item, "url")
- true -> nil
- end
+ if is_map(item), do: Map.get(item, :url) || Map.get(item, "url")
end)
- |> Enum.reject(&is_nil_or_empty/1)
+ |> Enum.reject(&nil_or_empty?/1)
direct_urls ++ search_urls
end
- defp is_nil_or_empty(nil), do: true
- defp is_nil_or_empty(""), do: true
- defp is_nil_or_empty(_value), do: false
+ defp nil_or_empty?(nil), do: true
+ defp nil_or_empty?(""), do: true
+ defp nil_or_empty?(_value), do: false
def signal_patterns do
[
diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex
index 5c5cf81..1eae9aa 100644
--- a/lib/jido_browser/web_fetch.ex
+++ b/lib/jido_browser/web_fetch.ex
@@ -177,9 +177,6 @@ defmodule Jido.Browser.WebFetch do
{_request, %_{} = exception} ->
{:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: exception})}
-
- {_request, reason} ->
- {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: reason})}
end
end
@@ -267,62 +264,49 @@ defmodule Jido.Browser.WebFetch do
})}
end
- defp build_response(
- url,
- final_url,
- content,
- title,
- content_type,
- document_type,
- opts,
- truncated,
- filtered,
- focus_matches,
- original_estimated_tokens,
- metadata
- ) do
- passages = maybe_build_passages(content, title, final_url, opts[:citations])
+ defp build_response(opts, attrs) do
+ passages = maybe_build_passages(attrs.content, attrs.title, attrs.final_url, opts[:citations])
%{
- url: url,
- final_url: final_url,
- title: title,
- content: content,
+ url: attrs.url,
+ final_url: attrs.final_url,
+ title: attrs.title,
+ content: attrs.content,
format: opts[:format],
- content_type: content_type,
- document_type: document_type,
+ content_type: attrs.content_type,
+ document_type: attrs.document_type,
retrieved_at: retrieved_at(),
- estimated_tokens: estimate_tokens(content),
- original_estimated_tokens: original_estimated_tokens,
- truncated: truncated,
- filtered: filtered,
- focus_matches: focus_matches,
+ estimated_tokens: estimate_tokens(attrs.content),
+ original_estimated_tokens: attrs.original_estimated_tokens,
+ truncated: attrs.truncated,
+ filtered: attrs.filtered,
+ focus_matches: attrs.focus_matches,
cached: false,
citations: %{enabled: opts[:citations]},
passages: passages
}
- |> maybe_put_metadata(metadata)
+ |> maybe_put_metadata(attrs.metadata)
end
defp finalize_result(url, final_url, content, title, content_type, document_type, opts, metadata \\ nil) do
with {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts),
{final_content, truncated, original_estimated_tokens} <-
maybe_truncate(filtered_content, opts[:max_content_tokens]) do
- {:ok,
- build_response(
- url,
- final_url,
- final_content,
- title,
- content_type,
- document_type,
- opts,
- truncated,
- filtered,
- focus_matches,
- original_estimated_tokens,
- metadata
- )}
+ attrs = %{
+ url: url,
+ final_url: final_url,
+ content: final_content,
+ title: title,
+ content_type: content_type,
+ document_type: document_type,
+ truncated: truncated,
+ filtered: filtered,
+ focus_matches: focus_matches,
+ original_estimated_tokens: original_estimated_tokens,
+ metadata: metadata
+ }
+
+ {:ok, build_response(opts, attrs)}
end
end
@@ -423,41 +407,10 @@ defmodule Jido.Browser.WebFetch do
normalized_url = String.trim(url)
max_url_length = opts[:max_url_length] || @default_max_url_length
- cond do
- normalized_url == "" ->
- {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})}
-
- String.length(normalized_url) > max_url_length ->
- {:error,
- Error.invalid_error("URL exceeds maximum length", %{
- error_code: :url_too_long,
- max_url_length: max_url_length
- })}
-
- true ->
- uri = URI.parse(normalized_url)
-
- cond do
- uri.scheme not in ["http", "https"] ->
- {:error,
- Error.invalid_error("Web fetch only supports http and https URLs", %{
- error_code: :invalid_input,
- scheme: uri.scheme
- })}
-
- is_nil(uri.host) or uri.host == "" ->
- {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})}
-
- not ascii_only?(uri.host) ->
- {:error,
- Error.invalid_error("Web fetch only accepts ASCII hostnames", %{
- error_code: :url_not_allowed,
- host: uri.host
- })}
-
- true ->
- {:ok, URI.to_string(uri), normalize_uri(uri)}
- end
+ with :ok <- validate_url_length(normalized_url, max_url_length),
+ {:ok, uri} <- parse_fetch_uri(normalized_url),
+ :ok <- validate_uri_host(uri) do
+ {:ok, URI.to_string(uri), normalize_uri(uri)}
end
end
@@ -468,9 +421,7 @@ defmodule Jido.Browser.WebFetch do
|> Enum.map(&normalize_known_url/1)
|> Enum.reject(&is_nil/1)
- if not Keyword.get(opts, :require_known_url, false) do
- :ok
- else
+ if Keyword.get(opts, :require_known_url, false) do
if url in known_urls do
:ok
else
@@ -480,6 +431,8 @@ defmodule Jido.Browser.WebFetch do
url: url
})}
end
+ else
+ :ok
end
end
@@ -566,7 +519,7 @@ defmodule Jido.Browser.WebFetch do
end
defp rule_matches?(%{host: host, path: path}, %URI{host: uri_host} = uri) do
- uri_host = String.downcase(uri_host || "")
+ uri_host = String.downcase(uri_host)
request_path = normalize_rule_path(uri.path || "/")
host_matches? = uri_host == host or String.ends_with?(uri_host, "." <> host)
@@ -677,36 +630,10 @@ defmodule Jido.Browser.WebFetch do
terms ->
sections = split_sections(content)
- downcased_terms = Enum.map(terms, &String.downcase/1)
-
- matching_indexes =
- sections
- |> Enum.with_index()
- |> Enum.flat_map(fn {section, index} ->
- lowered = String.downcase(section)
-
- if Enum.any?(downcased_terms, &String.contains?(lowered, &1)) do
- [index]
- else
- []
- end
- end)
-
+ matching_indexes = matching_section_indexes(sections, terms)
window = max(opts[:focus_window] || 0, 0)
-
- kept_indexes =
- matching_indexes
- |> Enum.flat_map(fn index -> (index - window)..(index + window) end)
- |> Enum.filter(&(&1 >= 0 and &1 < length(sections)))
- |> Enum.uniq()
- |> Enum.sort()
-
- filtered_content =
- kept_indexes
- |> Enum.map(&Enum.at(sections, &1))
- |> Enum.reject(&(&1 == ""))
- |> Enum.join("\n\n")
- |> String.trim()
+ kept_indexes = expand_focus_window(matching_indexes, window, length(sections))
+ filtered_content = render_section_slice(sections, kept_indexes)
{:ok, filtered_content, true, length(matching_indexes)}
end
@@ -768,19 +695,6 @@ defmodule Jido.Browser.WebFetch do
{:ok, %{content: content, metadata: metadata}} when is_binary(content) ->
{:ok, String.trim(content), normalize_metadata(metadata)}
- {:ok, %{content: content}} when is_binary(content) ->
- {:ok, String.trim(content), %{}}
-
- {:ok, result} ->
- {:error,
- Error.adapter_error("ExtractousEx returned an unexpected document payload", %{
- error_code: :unavailable,
- url: final_url,
- content_type: content_type,
- document_type: document_type,
- result: result
- })}
-
{:error, reason} ->
{:error,
Error.adapter_error("ExtractousEx failed while extracting document content", %{
@@ -806,25 +720,28 @@ defmodule Jido.Browser.WebFetch do
defp fetch_cached(url, opts) do
if opts[:cache] do
ensure_cache_table!()
- now = System.system_time(:millisecond)
-
- case :ets.lookup(@cache_table, cache_key(url, opts)) do
- [{_key, expires_at, result}] ->
- if expires_at > now do
- {:ok, Map.put(result, :cached, true)}
- else
- :ets.delete(@cache_table, cache_key(url, opts))
- :miss
- end
-
- [] ->
- :miss
- end
+ lookup_cached_result(cache_key(url, opts), System.system_time(:millisecond))
else
:miss
end
end
+  defp lookup_cached_result(key, now) do  # single ETS read; now is a millisecond timestamp for expiry
+    case :ets.lookup(@cache_table, key) do
+      [{_key, expires_at, result}] -> handle_cached_result(key, expires_at, result, now)
+      [] -> :miss  # nothing cached under this key
+    end
+  end
+
+  defp handle_cached_result(_key, expires_at, result, now) when expires_at > now do  # entry still fresh
+    {:ok, Map.put(result, :cached, true)}  # flag the result as served from cache
+  end
+
+  defp handle_cached_result(key, _expires_at, _result, _now) do  # entry expired: evict, then report miss
+    :ets.delete(@cache_table, key)
+    :miss
+  end
+
defp maybe_store_cache(url, opts, result) do
if opts[:cache] do
ensure_cache_table!()
@@ -976,6 +893,52 @@ defmodule Jido.Browser.WebFetch do
})}
end
+  defp validate_url_length("", _max_url_length) do  # URL already trimmed by caller; empty is rejected outright
+    {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})}
+  end
+
+  defp validate_url_length(normalized_url, max_url_length) do
+    if String.length(normalized_url) > max_url_length do  # grapheme count compared against the configured cap
+      {:error,
+       Error.invalid_error("URL exceeds maximum length", %{
+         error_code: :url_too_long,
+         max_url_length: max_url_length
+       })}
+    else
+      :ok
+    end
+  end
+
+  defp parse_fetch_uri(normalized_url) do  # parse the URL and restrict the scheme to http/https
+    uri = URI.parse(normalized_url)
+
+    if uri.scheme in ["http", "https"] do
+      {:ok, uri}
+    else
+      {:error,
+       Error.invalid_error("Web fetch only supports http and https URLs", %{
+         error_code: :invalid_input,
+         scheme: uri.scheme  # nil for scheme-less/relative URLs, which also land here
+       })}
+    end
+  end
+
+  defp validate_uri_host(%URI{host: host}) when host in [nil, ""] do  # a host component is mandatory
+    {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})}
+  end
+
+  defp validate_uri_host(%URI{host: host}) do
+    if ascii_only?(host) do  # non-ASCII hostnames are rejected as :url_not_allowed
+      :ok
+    else
+      {:error,
+       Error.invalid_error("Web fetch only accepts ASCII hostnames", %{
+         error_code: :url_not_allowed,
+         host: host
+       })}
+    end
+  end
+
defp normalize_integer_opt(_name, value, min: min) when is_integer(value) and value >= min, do: {:ok, value}
defp normalize_integer_opt(name, value, min: min) do
@@ -1086,8 +1049,6 @@ defmodule Jido.Browser.WebFetch do
end)
end
- defp metadata_title(_metadata), do: nil
-
defp metadata_value_to_string(nil), do: nil
defp metadata_value_to_string(value) when is_binary(value), do: String.trim(value)
@@ -1099,13 +1060,43 @@ defmodule Jido.Browser.WebFetch do
defp metadata_value_to_string(_value), do: nil
defp normalize_metadata(metadata) when is_map(metadata), do: metadata
- defp normalize_metadata(_metadata), do: %{}
defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response
defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata)
defp maybe_put(opts, _key, nil), do: opts
defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value)
+  defp matching_section_indexes(sections, terms) do  # indexes of sections containing any focus term
+    downcased_terms = Enum.map(terms, &String.downcase/1)  # downcase once for case-insensitive matching
+
+    sections
+    |> Enum.with_index()
+    |> Enum.flat_map(fn {section, index} ->
+      if section_matches_term?(section, downcased_terms), do: [index], else: []
+    end)
+  end
+
+  defp section_matches_term?(section, downcased_terms) do  # true if section contains any (pre-downcased) term
+    lowered = String.downcase(section)
+    Enum.any?(downcased_terms, &String.contains?(lowered, &1))
+  end
+
+  defp expand_focus_window(matching_indexes, window, section_count) do  # widen each match by ±window sections
+    matching_indexes
+    |> Enum.flat_map(fn index -> (index - window)..(index + window) end)
+    |> Enum.filter(&(&1 >= 0 and &1 < section_count))  # clamp neighbours to valid section indexes
+    |> Enum.uniq()
+    |> Enum.sort()  # overlapping windows deduplicated, returned in document order
+  end
+
+  defp render_section_slice(sections, indexes) do  # rejoin the kept sections with blank-line separators
+    indexes
+    |> Enum.map(&Enum.at(sections, &1))
+    |> Enum.reject(&(&1 == ""))  # drop empty sections so no stray separators appear
+    |> Enum.join("\n\n")
+    |> String.trim()
+  end
+
defp title_from_url(url) do
path = URI.parse(url).path || ""