From 10427687dde34acb3ac855f8ea51e1782bc605d1 Mon Sep 17 00:00:00 2001 From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com> Date: Sat, 21 Mar 2026 19:59:43 -0500 Subject: [PATCH 1/7] feat: add HTTP-first web fetch tool --- CHANGELOG.md | 6 +- README.md | 26 + lib/jido_browser.ex | 24 + lib/jido_browser/actions/web_fetch.ex | 92 ++ lib/jido_browser/plugin.ex | 107 +- lib/jido_browser/web_fetch.ex | 915 ++++++++++++++++++ mix.exs | 7 +- .../composite_actions_and_installer_test.exs | 59 ++ test/jido_browser/plugin_test.exs | 44 +- test/jido_browser/web_fetch_test.exs | 166 ++++ 10 files changed, 1418 insertions(+), 28 deletions(-) create mode 100644 lib/jido_browser/actions/web_fetch.ex create mode 100644 lib/jido_browser/web_fetch.ex create mode 100644 test/jido_browser/web_fetch_test.exs diff --git a/CHANGELOG.md b/CHANGELOG.md index fb19c91..99820c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add HTTP-first `Jido.Browser.web_fetch/2` and `Jido.Browser.Actions.WebFetch` for stateless page retrieval with domain policy, focused filtering, caching, and citation-ready passages + ### Changed - Rename the public Elixir namespace from `JidoBrowser.*` to `Jido.Browser.*` @@ -110,4 +114,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Refactoring: -* streamline agent-browser runtime defaults by mikehostetler \ No newline at end of file +* streamline agent-browser runtime defaults by mikehostetler diff --git a/README.md b/README.md index b7204cc..c1d1808 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,22 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow: 2. act on `@eN` refs 3. re-snapshot +### Stateless Web Fetch + +```elixir +{:ok, result} = + Jido.Browser.web_fetch( + "https://example.com/docs", + format: :markdown, + allowed_domains: ["example.com"], + focus_terms: ["API", "authentication"], + citations: true + ) + +result.content +result.passages +``` + ### State Persistence ```elixir @@ -143,6 +159,14 @@ config :jido_browser, :web, profile: "default" ``` +Optional web fetch settings: + +```elixir +config :jido_browser, :web_fetch, + cache_ttl_ms: 300_000, + pdftotext_path: "/usr/local/bin/pdftotext" +``` + ## Backends ### AgentBrowser (Default) @@ -173,6 +197,7 @@ Core operations: - `type/4` - `screenshot/2` - `extract_content/2` +- `web_fetch/2` - `evaluate/3` Agent-browser-native operations: @@ -252,6 +277,7 @@ Agent-browser-native operations: - `ReadPage` - `SnapshotUrl` - `SearchWeb` +- `WebFetch` ## Using With Jido Agents diff --git a/lib/jido_browser.ex b/lib/jido_browser.ex index 58f3e3a..91139ac 100644 --- a/lib/jido_browser.ex +++ b/lib/jido_browser.ex @@ -8,11 +8,13 @@ defmodule Jido.Browser do alias Jido.Browser.Error alias Jido.Browser.Session + alias Jido.Browser.WebFetch @default_adapter Jido.Browser.Adapters.AgentBrowser @default_timeout 30_000 @supported_screenshot_formats [:png] @supported_extract_formats [:markdown, :html, :text] + @supported_web_fetch_formats [:markdown, :html, :text] @doc "Starts a browser session using the configured adapter or an explicit adapter override." @spec start_session(keyword()) :: {:ok, Session.t()} | {:error, term()} @@ -107,6 +109,28 @@ defmodule Jido.Browser do end end + @doc "Fetches a URL over HTTP(S) without starting a browser session." + @spec web_fetch(String.t(), keyword()) :: {:ok, map()} | {:error, term()} + def web_fetch(url, opts \\ []) + + def web_fetch(url, _opts) when url in [nil, ""] do + {:error, Error.invalid_error("URL cannot be nil or empty", %{url: url})} + end + + def web_fetch(url, opts) when is_binary(url) do + format = opts[:format] || :markdown + + if format in @supported_web_fetch_formats do + WebFetch.fetch(url, normalize_timeout(opts)) + else + {:error, + Error.invalid_error("Unsupported web fetch format: #{inspect(format)}", %{ + format: format, + supported: @supported_web_fetch_formats + })} + end + end + @doc "Evaluates JavaScript in the browser when the adapter supports it." @spec evaluate(Session.t(), String.t(), keyword()) :: {:ok, Session.t(), map()} | {:error, term()} diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex new file mode 100644 index 0000000..417f07d --- /dev/null +++ b/lib/jido_browser/actions/web_fetch.ex @@ -0,0 +1,92 @@ +defmodule Jido.Browser.Actions.WebFetch do + @moduledoc """ + Stateless HTTP-first page retrieval for agent workflows. + + `WebFetch` is a lighter-weight alternative to browser navigation when the + target content can be retrieved over plain HTTP(S) without JavaScript + execution. + """ + + use Jido.Action, + name: "web_fetch", + description: + "Fetch a URL over HTTP(S) with domain policy controls, optional focused filtering, " <> + "approximate token caps, and citation-ready passages.", + category: "Browser", + tags: ["browser", "web", "fetch", "http", "retrieval"], + vsn: "2.0.0", + schema: [ + url: [type: :string, required: true, doc: "The URL to fetch"], + format: [type: {:in, [:markdown, :text, :html]}, default: :markdown, doc: "Output format"], + selector: [type: :string, doc: "Optional CSS selector for HTML pages"], + allowed_domains: [type: {:list, :string}, default: [], doc: "Allow-list of host or host/path rules"], + blocked_domains: [type: {:list, :string}, default: [], doc: "Block-list of host or host/path rules"], + focus_terms: [type: {:list, :string}, default: [], doc: "Terms used to filter the fetched document"], + focus_window: [type: :integer, default: 0, doc: "Paragraph window around each focus match"], + max_content_tokens: [type: :integer, doc: "Approximate token cap for returned content"], + citations: [type: :boolean, default: false, doc: "Include citation-ready passage offsets"], + cache: [type: :boolean, default: true, doc: "Reuse cached fetch results when available"], + timeout: [type: :integer, doc: "Receive timeout in milliseconds"], + require_known_url: [type: :boolean, default: false, doc: "Require the URL to already be present in tool context"], + known_urls: [type: {:list, :string}, default: [], doc: "Additional known URLs accepted for provenance checks"], + max_uses: [type: :integer, doc: "Maximum successful web fetch calls allowed in current skill state"] + ] + + alias Jido.Browser.Error + + @impl true + def run(params, context) do + with :ok <- validate_max_uses(params, context), + {:ok, result} <- Jido.Browser.web_fetch(params.url, build_opts(params, context)) do + {:ok, Map.put(result, :status, "success")} + else + {:error, %_{} = error} -> + {:error, error} + + {:error, reason} -> + {:error, Error.adapter_error("Web fetch failed", %{reason: reason})} + end + end + + defp build_opts(params, context) do + known_urls = + (Map.get(params, :known_urls, []) || []) + |> Kernel.++(get_in(context, [:skill_state, :seen_urls]) || []) + |> Enum.uniq() + + [] + |> maybe_put(:format, Map.get(params, :format, :markdown)) + |> maybe_put(:selector, params[:selector]) + |> maybe_put(:allowed_domains, Map.get(params, :allowed_domains, [])) + |> maybe_put(:blocked_domains, Map.get(params, :blocked_domains, [])) + |> maybe_put(:focus_terms, Map.get(params, :focus_terms, [])) + |> maybe_put(:focus_window, Map.get(params, :focus_window, 0)) + |> maybe_put(:max_content_tokens, params[:max_content_tokens]) + |> maybe_put(:citations, Map.get(params, :citations, false)) + |> maybe_put(:cache, Map.get(params, :cache, true)) + |> maybe_put(:timeout, params[:timeout]) + |> maybe_put(:require_known_url, Map.get(params, :require_known_url, false)) + |> maybe_put(:known_urls, known_urls) + end + + defp validate_max_uses(%{max_uses: max_uses}, context) when is_integer(max_uses) and max_uses >= 0 do + current_uses = get_in(context, [:skill_state, :web_fetch_uses]) || 0 + + if current_uses >= max_uses do + {:error, + Error.invalid_error("Web fetch max uses exceeded", %{ + error_code: :max_uses_exceeded, + max_uses: max_uses, + current_uses: current_uses + })} + else + :ok + end + end + + defp validate_max_uses(_params, _context), do: :ok + + defp maybe_put(opts, _key, nil), do: opts + defp maybe_put(opts, _key, []), do: opts + defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value) +end diff --git a/lib/jido_browser/plugin.ex b/lib/jido_browser/plugin.ex index 4e99408..2a58bba 100644 --- a/lib/jido_browser/plugin.ex +++ b/lib/jido_browser/plugin.ex @@ -36,6 +36,7 @@ require Jido.Browser.Actions.WaitForSelector require Jido.Browser.Actions.ReadPage require Jido.Browser.Actions.SnapshotUrl require Jido.Browser.Actions.SearchWeb +require Jido.Browser.Actions.WebFetch defmodule Jido.Browser.Plugin do @moduledoc """ @@ -119,7 +120,8 @@ defmodule Jido.Browser.Plugin do # Self-contained composite actions (manage own session) Jido.Browser.Actions.ReadPage, Jido.Browser.Actions.SnapshotUrl, - Jido.Browser.Actions.SearchWeb + Jido.Browser.Actions.SearchWeb, + Jido.Browser.Actions.WebFetch ], description: "Browser automation for web navigation, interaction, and content extraction", category: "browser", @@ -136,7 +138,9 @@ defmodule Jido.Browser.Plugin do viewport: Map.get(config, :viewport, %{width: 1280, height: 720}), base_url: Map.get(config, :base_url), last_url: nil, - last_title: nil + last_title: nil, + seen_urls: [], + web_fetch_uses: 0 } {:ok, initial_state} @@ -151,7 +155,9 @@ defmodule Jido.Browser.Plugin do viewport: Zoi.any(description: "Browser viewport dimensions") |> Zoi.optional(), base_url: Zoi.string(description: "Base URL for relative navigation") |> Zoi.optional(), last_url: Zoi.string(description: "Last navigated URL") |> Zoi.optional(), - last_title: Zoi.string(description: "Last page title") |> Zoi.optional() + last_title: Zoi.string(description: "Last page title") |> Zoi.optional(), + seen_urls: Zoi.array(Zoi.string(description: "Known URLs discovered during tool use")) |> Zoi.default([]), + web_fetch_uses: Zoi.integer(description: "Successful web fetch calls in current skill state") |> Zoi.default(0) }) end @@ -204,7 +210,8 @@ defmodule Jido.Browser.Plugin do # Self-contained composite actions {"browser.read_page", Jido.Browser.Actions.ReadPage}, {"browser.snapshot_url", Jido.Browser.Actions.SnapshotUrl}, - {"browser.search_web", Jido.Browser.Actions.SearchWeb} + {"browser.search_web", Jido.Browser.Actions.SearchWeb}, + {"browser.web_fetch", Jido.Browser.Actions.WebFetch} ] end @@ -214,22 +221,17 @@ defmodule Jido.Browser.Plugin do end @impl Jido.Plugin - def transform_result(_action, {:ok, result}, _context) when is_map(result) do - case Map.get(result, :session) do - %Jido.Browser.Session{} = session -> - current_url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url]) - current_title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title]) - - state_updates = %{ - session: session, - last_url: current_url, - last_title: current_title - } - - {:ok, result, state_updates} + def transform_result(action, {:ok, result}, context) when is_map(result) do + state_updates = + %{} + |> maybe_put_session_state(result) + |> maybe_put_seen_urls(result, context) + |> maybe_increment_web_fetch_uses(action, context) - _ -> - {:ok, result} + if map_size(state_updates) == 0 do + {:ok, result} + else + {:ok, result, state_updates} end end @@ -260,6 +262,70 @@ defmodule Jido.Browser.Plugin do end end + defp maybe_put_session_state(acc, result) do + case Map.get(result, :session) do + %Jido.Browser.Session{} = session -> + current_url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url]) + current_title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title]) + + Map.merge(acc, %{ + session: session, + last_url: current_url, + last_title: current_title + }) + + _ -> + acc + end + end + + defp maybe_put_seen_urls(acc, result, context) do + current_seen_urls = get_in(context, [:skill_state, :seen_urls]) || [] + + seen_urls = + current_seen_urls + |> Kernel.++(extract_urls(result)) + |> Enum.reject(&is_nil_or_empty/1) + |> Enum.uniq() + + if seen_urls == [] or seen_urls == current_seen_urls do + acc + else + Map.put(acc, :seen_urls, seen_urls) + end + end + + defp maybe_increment_web_fetch_uses(acc, Jido.Browser.Actions.WebFetch, context) do + current_uses = get_in(context, [:skill_state, :web_fetch_uses]) || 0 + Map.put(acc, :web_fetch_uses, current_uses + 1) + end + + defp maybe_increment_web_fetch_uses(acc, _action, _context), do: acc + + defp extract_urls(result) do + direct_urls = + [Map.get(result, :url), Map.get(result, "url"), Map.get(result, :final_url), Map.get(result, "final_url")] + |> Enum.reject(&is_nil_or_empty/1) + + search_urls = + result + |> Map.get(:results, Map.get(result, "results", [])) + |> List.wrap() + |> Enum.map(fn item -> + cond do + is_map(item) -> Map.get(item, :url) || Map.get(item, "url") + true -> nil + end + end) + |> Enum.reject(&is_nil_or_empty/1) + + direct_urls ++ search_urls + end + + defp is_nil_or_empty(nil), do: true + defp is_nil_or_empty(""), do: true + defp is_nil_or_empty(_value), do: false + def signal_patterns do [ # Session lifecycle @@ -308,7 +374,8 @@ defmodule Jido.Browser.Plugin do # Self-contained composite actions "browser.read_page", "browser.snapshot_url", - "browser.search_web" + "browser.search_web", + "browser.web_fetch" ] end end diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex new file mode 100644 index 0000000..8599598 --- /dev/null +++ b/lib/jido_browser/web_fetch.ex @@ -0,0 +1,915 @@ +defmodule Jido.Browser.WebFetch do + @moduledoc """ + Stateless HTTP-first web retrieval with optional domain policy, caching, + focused filtering, and citation-ready passage metadata. + + This module is intended for document retrieval workloads where starting a full + browser session would be unnecessary or too expensive. + """ + + alias Jido.Browser.Error + + @cache_table :jido_browser_web_fetch_cache + @default_timeout 15_000 + @default_max_redirects 5 + @default_cache_ttl_ms 300_000 + @default_max_url_length 2_048 + @supported_formats [:markdown, :text, :html] + @html_content_types ["text/html", "application/xhtml+xml"] + @text_content_types ["text/plain", "text/markdown", "text/csv", "text/xml", "application/xml"] + @pdf_content_types ["application/pdf"] + + @type result :: %{ + required(:url) => String.t(), + required(:final_url) => String.t(), + required(:content) => String.t(), + required(:format) => atom(), + required(:content_type) => String.t(), + required(:document_type) => atom(), + required(:retrieved_at) => String.t(), + required(:estimated_tokens) => non_neg_integer(), + required(:original_estimated_tokens) => non_neg_integer(), + required(:truncated) => boolean(), + required(:filtered) => boolean(), + required(:focus_matches) => non_neg_integer(), + required(:cached) => boolean(), + required(:citations) => %{enabled: boolean()}, + required(:passages) => list(map()), + optional(:title) => String.t() | nil + } + + @doc """ + Fetches a URL over HTTP(S) and returns normalized document content. + + Supported options: + - `:format` - `:markdown`, `:text`, or `:html` + - `:selector` - CSS selector for HTML pages + - `:allowed_domains` / `:blocked_domains` - mutually exclusive host/path rules + - `:max_content_tokens` - approximate token cap + - `:citations` - boolean, when true include passage spans + - `:focus_terms` - list of terms used for focused filtering + - `:focus_window` - paragraph window around focus matches + - `:timeout` - receive timeout in milliseconds + - `:cache` - enable ETS cache, defaults to `true` + - `:cache_ttl_ms` - cache TTL in milliseconds + - `:require_known_url` / `:known_urls` - optional URL provenance guard + """ + @spec fetch(String.t(), keyword()) :: {:ok, result()} | {:error, Exception.t()} + def fetch(url, opts \\ []) + + def fetch(url, opts) when is_binary(url) and is_list(opts) do + with {:ok, opts} <- normalize_opts(opts), + {:ok, normalized_url, uri} <- validate_url(url, opts), + :ok <- validate_known_url(normalized_url, opts), + :ok <- validate_domain_filters(uri, opts) do + case fetch_cached(normalized_url, opts) do + {:ok, result} -> + {:ok, result} + + :miss -> + do_fetch(normalized_url, opts) + end + end + end + + def fetch(url, _opts) do + {:error, Error.invalid_error("URL must be a non-empty string", %{error_code: :invalid_input, url: url})} + end + + @doc false + @spec clear_cache() :: :ok + def clear_cache do + case :ets.whereis(@cache_table) do + :undefined -> + :ok + + table -> + :ets.delete_all_objects(table) + :ok + end + end + + defp do_fetch(url, opts) do + request_opts = [ + url: url, + headers: request_headers(), + receive_timeout: opts[:timeout], + redirect: true, + max_redirects: opts[:max_redirects] + ] + + case Req.run(request_opts) do + {%Req.Request{} = request, %Req.Response{} = response} -> + with :ok <- validate_http_status(response, url), + {:ok, final_url, final_uri} <- normalize_final_url(request), + :ok <- validate_domain_filters(final_uri, opts), + {:ok, result} <- build_result(url, final_url, response, opts) do + maybe_store_cache(url, opts, result) + {:ok, result} + end + + {_request, %Req.TransportError{} = exception} -> + {:error, Error.adapter_error("Web fetch request failed", %{error_code: :url_not_accessible, reason: exception})} + + {_request, %Req.TooManyRedirectsError{} = exception} -> + {:error, + Error.adapter_error("Web fetch exceeded redirect limit", %{error_code: :url_not_accessible, reason: exception})} + + {_request, %_{} = exception} -> + {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: exception})} + + {_request, reason} -> + {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: reason})} + end + end + + defp build_result(url, final_url, response, opts) do + content_type = response_content_type(response) + + cond do + content_type in @html_content_types -> + build_html_result(url, final_url, response.body, content_type, opts) + + content_type in @pdf_content_types -> + build_pdf_result(url, final_url, response.body, content_type, opts) + + text_content_type?(content_type) -> + build_text_result(url, final_url, response.body, content_type, opts) + + true -> + {:error, + Error.adapter_error("Unsupported content type for web fetch", %{ + error_code: :unsupported_content_type, + content_type: content_type + })} + end + end + + defp build_html_result(url, final_url, body, content_type, opts) when is_binary(body) do + selector = opts[:selector] + + with {:ok, document} <- parse_document(body), + {:ok, html} <- select_html(document, body, selector), + {:ok, title} <- extract_title(document), + {:ok, content} <- format_html(html, opts[:format], opts), + {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts), + {final_content, truncated, original_estimated_tokens} <- + maybe_truncate(filtered_content, opts[:max_content_tokens]) do + {:ok, + build_response( + url, + final_url, + final_content, + title, + content_type, + :html, + opts, + truncated, + filtered, + focus_matches, + original_estimated_tokens + )} + end + end + + defp build_html_result(_url, _final_url, body, content_type, _opts) do + {:error, + Error.adapter_error("Unexpected response body for HTML fetch", %{ + error_code: :unavailable, + content_type: content_type, + body: body + })} + end + + defp build_text_result(url, final_url, body, content_type, opts) when is_binary(body) do + if opts[:selector] do + {:error, + Error.invalid_error("Selector filtering is only supported for HTML content", %{ + error_code: :invalid_input, + selector: opts[:selector], + content_type: content_type + })} + else + with {:ok, content} <- format_text(body, opts[:format]), + {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts), + {final_content, truncated, original_estimated_tokens} <- + maybe_truncate(filtered_content, opts[:max_content_tokens]) do + {:ok, + build_response( + url, + final_url, + final_content, + nil, + content_type, + :text, + opts, + truncated, + filtered, + focus_matches, + original_estimated_tokens + )} + end + end + end + + defp build_text_result(_url, _final_url, body, content_type, _opts) do + {:error, + Error.adapter_error("Unexpected response body for text fetch", %{ + error_code: :unavailable, + content_type: content_type, + body: body + })} + end + + defp build_pdf_result(url, final_url, body, content_type, opts) when is_binary(body) do + cond do + opts[:selector] -> + {:error, + Error.invalid_error("Selector filtering is not supported for PDF content", %{ + error_code: :invalid_input, + selector: opts[:selector], + content_type: content_type + })} + + opts[:format] == :html -> + {:error, + Error.invalid_error("HTML output is not supported for PDF content", %{ + error_code: :invalid_input, + format: :html, + content_type: content_type + })} + + true -> + with {:ok, text} <- extract_pdf_text(body), + {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts), + {final_content, truncated, original_estimated_tokens} <- + maybe_truncate(filtered_content, opts[:max_content_tokens]) do + {:ok, + build_response( + url, + final_url, + final_content, + title_from_url(final_url), + content_type, + :pdf, + opts, + truncated, + filtered, + focus_matches, + original_estimated_tokens + )} + end + end + end + + defp build_pdf_result(_url, _final_url, body, content_type, _opts) do + {:error, + Error.adapter_error("Unexpected response body for PDF fetch", %{ + error_code: :unavailable, + content_type: content_type, + body: body + })} + end + + defp build_response( + url, + final_url, + content, + title, + content_type, + document_type, + opts, + truncated, + filtered, + focus_matches, + original_estimated_tokens + ) do + passages = maybe_build_passages(content, title, final_url, opts[:citations]) + + %{ + url: url, + final_url: final_url, + title: title, + content: content, + format: opts[:format], + content_type: content_type, + document_type: document_type, + retrieved_at: retrieved_at(), + estimated_tokens: estimate_tokens(content), + original_estimated_tokens: original_estimated_tokens, + truncated: truncated, + filtered: filtered, + focus_matches: focus_matches, + cached: false, + citations: %{enabled: opts[:citations]}, + passages: passages + } + end + + defp normalize_opts(opts) do + format = opts[:format] || :markdown + citations = normalize_citations(opts[:citations]) + focus_terms = normalize_focus_terms(opts[:focus_terms]) + + cond do + format not in @supported_formats -> + {:error, + Error.invalid_error("Unsupported web fetch format", %{ + error_code: :invalid_input, + format: format, + supported_formats: @supported_formats + })} + + present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) -> + {:error, + Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{ + error_code: :invalid_input + })} + + format == :html and focus_terms != [] -> + {:error, + Error.invalid_error("Focused filtering is only supported for markdown and text output", %{ + error_code: :invalid_input, + format: format + })} + + true -> + normalized = + opts + |> Keyword.put(:format, format) + |> Keyword.put(:citations, citations) + |> Keyword.put(:focus_terms, focus_terms) + |> Keyword.put_new(:focus_window, 0) + |> Keyword.put_new(:timeout, config(:timeout, @default_timeout)) + |> Keyword.put_new(:max_redirects, @default_max_redirects) + |> Keyword.put_new(:cache, true) + |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)) + |> Keyword.put_new(:known_urls, []) + + {:ok, normalized} + end + end + + defp validate_url(url, opts) do + normalized_url = String.trim(url) + max_url_length = opts[:max_url_length] || @default_max_url_length + + cond do + normalized_url == "" -> + {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})} + + String.length(normalized_url) > max_url_length -> + {:error, + Error.invalid_error("URL exceeds maximum length", %{ + error_code: :url_too_long, + max_url_length: max_url_length + })} + + true -> + uri = URI.parse(normalized_url) + + cond do + uri.scheme not in ["http", "https"] -> + {:error, + Error.invalid_error("Web fetch only supports http and https URLs", %{ + error_code: :invalid_input, + scheme: uri.scheme + })} + + is_nil(uri.host) or uri.host == "" -> + {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})} + + not ascii_only?(uri.host) -> + {:error, + Error.invalid_error("Web fetch only accepts ASCII hostnames", %{ + error_code: :url_not_allowed, + host: uri.host + })} + + true -> + {:ok, URI.to_string(uri), normalize_uri(uri)} + end + end + end + + defp validate_known_url(url, opts) do + known_urls = + opts[:known_urls] + |> List.wrap() + |> Enum.map(&normalize_known_url/1) + |> Enum.reject(&is_nil/1) + + if not Keyword.get(opts, :require_known_url, false) do + :ok + else + if url in known_urls do + :ok + else + {:error, + Error.invalid_error("Web fetch URL must already be present in tool context", %{ + error_code: :url_not_allowed, + url: url + })} + end + end + end + + defp validate_domain_filters(%URI{} = uri, opts) do + with {:ok, allowed_rules} <- normalize_domain_rules(opts[:allowed_domains]), + {:ok, blocked_rules} <- normalize_domain_rules(opts[:blocked_domains]) do + cond do + allowed_rules != [] and not Enum.any?(allowed_rules, &rule_matches?(&1, uri)) -> + {:error, + Error.invalid_error("URL is not permitted by allowed_domains", %{ + error_code: :url_not_allowed, + url: URI.to_string(uri) + })} + + blocked_rules != [] and Enum.any?(blocked_rules, &rule_matches?(&1, uri)) -> + {:error, + Error.invalid_error("URL is blocked by blocked_domains", %{ + error_code: :url_not_allowed, + url: URI.to_string(uri) + })} + + true -> + :ok + end + end + end + + defp normalize_domain_rules(nil), do: {:ok, []} + + defp normalize_domain_rules(rules) do + rules + |> List.wrap() + |> Enum.reduce_while({:ok, []}, fn rule, {:ok, acc} -> + case normalize_domain_rule(rule) do + {:ok, normalized} -> {:cont, {:ok, [normalized | acc]}} + {:error, reason} -> {:halt, {:error, reason}} + end + end) + |> case do + {:ok, normalized} -> {:ok, Enum.reverse(normalized)} + error -> error + end + end + + defp normalize_domain_rule(rule) when is_binary(rule) do + normalized = String.trim(rule) + + cond do + normalized == "" -> + {:error, Error.invalid_error("Domain rules cannot be empty", %{error_code: :invalid_input})} + + String.contains?(normalized, "://") -> + {:error, + Error.invalid_error("Domain rules must not include URL schemes", %{ + error_code: :invalid_input, + rule: normalized + })} + + true -> + uri = URI.parse("https://" <> normalized) + host = String.downcase(uri.host || "") + path = uri.path || "/" + + cond do + host == "" -> + {:error, + Error.invalid_error("Domain rule must include a host", %{error_code: :invalid_input, rule: normalized})} + + not ascii_only?(host) -> + {:error, + Error.invalid_error("Domain rules must use ASCII hosts", %{ + error_code: :invalid_input, + rule: normalized + })} + + true -> + {:ok, %{host: host, path: normalize_rule_path(path)}} + end + end + end + + defp normalize_domain_rule(rule) do + {:error, Error.invalid_error("Domain rule must be a string", %{error_code: :invalid_input, rule: rule})} + end + + defp rule_matches?(%{host: host, path: path}, %URI{host: uri_host} = uri) do + uri_host = String.downcase(uri_host || "") + request_path = normalize_rule_path(uri.path || "/") + + host_matches? = uri_host == host or String.ends_with?(uri_host, "." <> host) + path_matches? = path == "/" or String.starts_with?(request_path, path) + + host_matches? and path_matches? + end + + defp normalize_final_url(%Req.Request{url: %URI{} = uri}) do + normalized = normalize_uri(uri) + {:ok, URI.to_string(normalized), normalized} + end + + defp validate_http_status(%Req.Response{status: status}, _url) when status in 200..299, do: :ok + + defp validate_http_status(%Req.Response{status: 429}, _url) do + {:error, Error.adapter_error("Web fetch rate limited", %{error_code: :too_many_requests, status: 429})} + end + + defp validate_http_status(%Req.Response{status: status}, url) do + {:error, + Error.adapter_error("Web fetch returned an HTTP error", %{ + error_code: :url_not_accessible, + status: status, + url: url + })} + end + + defp parse_document(body) do + case Floki.parse_document(body) do + {:ok, document} -> + {:ok, document} + + {:error, reason} -> + {:error, Error.adapter_error("Failed to parse fetched HTML", %{error_code: :unavailable, reason: reason})} + end + end + + defp select_html(_document, body, nil), do: {:ok, body} + defp select_html(document, _body, ""), do: select_html(document, nil, nil) + + defp select_html(document, _body, selector) do + nodes = Floki.find(document, selector) + + if nodes == [] do + {:error, + Error.invalid_error("Selector did not match any elements in fetched HTML", %{ + error_code: :invalid_input, + selector: selector + })} + else + {:ok, Floki.raw_html(nodes)} + end + end + + defp extract_title(document) do + title = + document + |> Floki.find("title") + |> Floki.text(sep: " ") + |> String.trim() + |> blank_to_nil() + + {:ok, title} + end + + defp format_html(html, :html, _opts), do: {:ok, html} + + defp format_html(html, :text, _opts) do + with {:ok, fragment} <- parse_fragment(html) do + {:ok, fragment |> Floki.text(sep: "\n") |> String.trim()} + end + end + + defp format_html(html, :markdown, _opts) do + {:ok, Html2Markdown.convert(html) |> String.trim()} + rescue + error -> + {:error, + Error.adapter_error("Failed to convert fetched HTML to markdown", %{error_code: :unavailable, reason: error})} + end + + defp format_text(text, :text), do: {:ok, String.trim(text)} + defp format_text(text, :markdown), do: {:ok, String.trim(text)} + + defp format_text(_text, :html) do + {:error, + Error.invalid_error("HTML output is only supported for HTML content", %{ + error_code: :invalid_input + })} + end + + defp parse_fragment(html) do + case Floki.parse_fragment(html) do + {:ok, fragment} -> + {:ok, fragment} + + {:error, reason} -> + {:error, + Error.adapter_error("Failed to parse fetched HTML fragment", %{error_code: :unavailable, reason: reason})} + end + end + + defp maybe_filter_content(content, opts) do + case opts[:focus_terms] do + [] -> + {:ok, content, false, 0} + + terms -> + sections = split_sections(content) + downcased_terms = Enum.map(terms, &String.downcase/1) + + matching_indexes = + sections + |> Enum.with_index() + |> Enum.flat_map(fn {section, index} -> + lowered = String.downcase(section) + + if Enum.any?(downcased_terms, &String.contains?(lowered, &1)) do + [index] + else + [] + end + end) + + window = max(opts[:focus_window] || 0, 0) + + kept_indexes = + matching_indexes + |> Enum.flat_map(fn index -> (index - window)..(index + window) end) + |> Enum.filter(&(&1 >= 0 and &1 < length(sections))) + |> Enum.uniq() + |> Enum.sort() + + filtered_content = + kept_indexes + |> Enum.map(&Enum.at(sections, &1)) + |> Enum.reject(&(&1 == "")) + |> Enum.join("\n\n") + |> String.trim() + + {:ok, filtered_content, true, length(matching_indexes)} + end + end + + defp maybe_truncate(content, nil), do: {content, false, estimate_tokens(content)} + + defp maybe_truncate(content, max_content_tokens) when is_integer(max_content_tokens) and max_content_tokens > 0 do + original_estimated_tokens = estimate_tokens(content) + + if original_estimated_tokens <= max_content_tokens do + {content, false, original_estimated_tokens} + else + char_limit = max_content_tokens * 4 + truncated = String.slice(content, 0, char_limit) |> String.trim() + {truncated, true, original_estimated_tokens} + end + end + + defp maybe_truncate(content, _other), do: {content, false, estimate_tokens(content)} + + defp maybe_build_passages(_content, _title, _url, false), do: [] + + defp maybe_build_passages(content, title, url, true) do + content + |> split_sections() + |> Enum.reject(&(&1 == "")) + |> Enum.reduce({[], 0, 0}, fn section, {passages, cursor, index} -> + start_char = cursor + end_char = start_char + String.length(section) + + passage = %{ + index: index, + start_char: start_char, + end_char: end_char, + text: section, + title: title, + url: url + } + + {[passage | passages], end_char + 2, index + 1} + end) + |> elem(0) + |> Enum.reverse() + |> Enum.take(50) + end + + defp split_sections(content) do + content + |> String.split(~r/\n\s*\n+/, trim: true) + |> case do + [] -> [String.trim(content)] + sections -> Enum.map(sections, &String.trim/1) + end + end + + defp extract_pdf_text(bytes) do + case pdftotext_path() do + nil -> + {:error, + Error.adapter_error("PDF extraction requires pdftotext to be installed", %{ + error_code: :unsupported_content_type, + content_type: "application/pdf" + })} + + binary -> + with_tmp_files("jido_browser_web_fetch", ".pdf", ".txt", fn pdf_path, txt_path -> + File.write!(pdf_path, bytes) + + case System.cmd(binary, ["-layout", "-nopgbrk", pdf_path, txt_path], stderr_to_stdout: true) do + {_output, 0} -> + case File.read(txt_path) do + {:ok, text} -> + {:ok, String.trim(text)} + + {:error, reason} -> + {:error, + Error.adapter_error("Failed to read extracted PDF text", %{error_code: :unavailable, reason: reason})} + end + + {output, status} -> + {:error, + Error.adapter_error("pdftotext failed while extracting PDF", %{ + error_code: :unavailable, + status: status, + output: output + })} + end + end) + end + end + + defp pdftotext_path do + config(:pdftotext_path) || System.find_executable("pdftotext") + end + + defp fetch_cached(url, opts) do + if opts[:cache] do + ensure_cache_table!() + now = System.system_time(:millisecond) + + case :ets.lookup(@cache_table, cache_key(url, opts)) do + [{_key, expires_at, result}] -> + if expires_at > now do + {:ok, Map.put(result, :cached, true)} + else + :ets.delete(@cache_table, cache_key(url, opts)) + :miss + end + + [] -> + :miss + end + else + :miss + end + end + + defp maybe_store_cache(url, opts, result) do + if opts[:cache] do + ensure_cache_table!() + + expires_at = System.system_time(:millisecond) + max(opts[:cache_ttl_ms], 0) + :ets.insert(@cache_table, {cache_key(url, opts), expires_at, result}) + end + + :ok + end + + defp ensure_cache_table! do + case :ets.whereis(@cache_table) do + :undefined -> + try do + :ets.new(@cache_table, [:named_table, :set, :public, read_concurrency: true, write_concurrency: true]) + rescue + ArgumentError -> @cache_table + end + + table -> + table + end + end + + defp cache_key(url, opts) do + {:jido_browser_web_fetch, url, opts[:format], opts[:selector], opts[:allowed_domains], opts[:blocked_domains], + opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations]} + end + + defp request_headers do + [ + {"accept", "text/html,application/xhtml+xml,text/plain,application/pdf;q=0.9,*/*;q=0.1"}, + {"user-agent", user_agent()} + ] + end + + defp user_agent do + vsn = + case Application.spec(:jido_browser, :vsn) do + nil -> "dev" + value -> List.to_string(value) + end + + "jido_browser/#{vsn}" + end + + defp response_content_type(response) do + response + |> Req.Response.get_header("content-type") + |> List.first() + |> case do + nil -> infer_content_type(response.body) + content_type -> content_type |> String.split(";") |> hd() |> String.trim() |> String.downcase() + end + end + + defp infer_content_type(body) when is_binary(body) do + if String.starts_with?(body, "%PDF-") do + "application/pdf" + else + "text/plain" + end + end + + defp infer_content_type(_body), do: "application/octet-stream" + + defp text_content_type?(content_type) do + content_type in @text_content_types or String.starts_with?(content_type, "text/") + end + + defp retrieved_at do + DateTime.utc_now() + |> DateTime.truncate(:second) + |> DateTime.to_iso8601() + end + + defp estimate_tokens(content) when is_binary(content) do + div(String.length(content) + 3, 4) + end + + defp estimate_tokens(_content), do: 0 + + defp normalize_citations(%{enabled: enabled}), do: enabled == true + defp normalize_citations(enabled), do: enabled == true + + defp present_domain_rules?(rules), do: rules not in [nil, []] + + defp normalize_focus_terms(nil), do: [] + + defp normalize_focus_terms(terms) do + terms + |> List.wrap() + |> Enum.map(fn + term when is_binary(term) -> String.trim(term) + term -> to_string(term) + end) + |> Enum.reject(&(&1 == "")) + |> Enum.uniq() + end + + defp normalize_known_url(url) when is_binary(url) do + url + |> String.trim() + |> case do + "" -> nil + value -> value + end + end + + defp normalize_known_url(_), do: nil + + defp normalize_uri(%URI{} = uri) do + %{uri | host: String.downcase(uri.host || ""), fragment: nil} + end + + defp normalize_rule_path(nil), do: "/" + defp normalize_rule_path(""), do: "/" + defp normalize_rule_path(path), do: if(String.starts_with?(path, "/"), do: path, else: "/" <> path) + + defp title_from_url(url) do + path = URI.parse(url).path || "" + + case path do + "" -> nil + "/" -> nil + value -> value |> Path.basename() |> String.trim("/") |> blank_to_nil() + end + end + + defp blank_to_nil(nil), do: nil + defp blank_to_nil(""), do: nil + defp blank_to_nil(value), do: value + + defp ascii_only?(value) when is_binary(value) do + String.printable?(value) and String.match?(value, ~r/^[\x00-\x7F]+$/) + end + + defp config(key, default \\ nil) do + :jido_browser + |> Application.get_env(:web_fetch, []) + |> Keyword.get(key, default) + end + + defp with_tmp_files(prefix, first_suffix, second_suffix, fun) do + base = Path.join(System.tmp_dir!(), "#{prefix}_#{System.unique_integer([:positive])}") + first = base <> first_suffix + second = base <> second_suffix + + try do + fun.(first, second) + after + File.rm(first) + File.rm(second) + end + end +end diff --git a/mix.exs b/mix.exs index b7ce8ca..091d21a 100644 --- a/mix.exs +++ b/mix.exs @@ -69,6 +69,7 @@ defmodule Jido.Browser.MixProject do {:req, "~> 0.5"}, {:jason, "~> 1.4"}, {:uniq, "~> 0.6"}, + {:floki, "~> 0.38"}, {:html2markdown, "~> 0.3"}, # Dev/Test @@ -111,7 +112,8 @@ defmodule Jido.Browser.MixProject do Core: [ Jido.Browser, Jido.Browser.Session, - Jido.Browser.Plugin + Jido.Browser.Plugin, + Jido.Browser.WebFetch ], Adapters: [ Jido.Browser.Adapter, @@ -154,7 +156,8 @@ defmodule Jido.Browser.MixProject do "Content Extraction": [ Jido.Browser.Actions.Snapshot, Jido.Browser.Actions.Screenshot, - Jido.Browser.Actions.ExtractContent + Jido.Browser.Actions.ExtractContent, + Jido.Browser.Actions.WebFetch ], Advanced: [ Jido.Browser.Actions.Evaluate diff --git a/test/jido_browser/composite_actions_and_installer_test.exs b/test/jido_browser/composite_actions_and_installer_test.exs index dddf9d5..52b4633 100644 --- a/test/jido_browser/composite_actions_and_installer_test.exs +++ b/test/jido_browser/composite_actions_and_installer_test.exs @@ -5,6 +5,7 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do alias Jido.Browser.Actions.ReadPage alias Jido.Browser.Actions.SearchWeb alias Jido.Browser.Actions.SnapshotUrl + alias Jido.Browser.Actions.WebFetch alias Jido.Browser.Installer alias Jido.Browser.Session @@ -176,6 +177,64 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do end end + describe "WebFetch.run/2" do + test "passes provenance options through to the fetch API" do + expect(Jido.Browser, :web_fetch, fn "https://example.com/guide", opts -> + assert opts[:require_known_url] == true + assert "https://example.com/guide" in opts[:known_urls] + assert opts[:allowed_domains] == ["example.com"] + + {:ok, + %{ + url: "https://example.com/guide", + final_url: "https://example.com/guide", + title: "Guide", + content: "Fetched guide content", + format: :markdown, + content_type: "text/html", + document_type: :html, + retrieved_at: "2026-03-21T00:00:00Z", + estimated_tokens: 5, + original_estimated_tokens: 5, + truncated: false, + filtered: false, + focus_matches: 0, + cached: false, + citations: %{enabled: false}, + passages: [] + }} + end) + + context = %{skill_state: %{seen_urls: ["https://example.com/guide"], web_fetch_uses: 0}} + + assert {:ok, result} = + WebFetch.run( + %{ + url: "https://example.com/guide", + require_known_url: true, + allowed_domains: ["example.com"] + }, + context + ) + + assert result.status == "success" + assert result.url == "https://example.com/guide" + end + + test "returns max_uses_exceeded before calling the fetch API" do + context = %{skill_state: %{web_fetch_uses: 2}} + + assert {:error, error} = + WebFetch.run( + %{url: "https://example.com/guide", max_uses: 2}, + context + ) + + assert %Jido.Browser.Error.InvalidError{} = error + assert error.details.error_code == :max_uses_exceeded + end + end + describe "Installer" do test "target returns a supported platform atom" do assert Installer.target() in [ diff --git a/test/jido_browser/plugin_test.exs b/test/jido_browser/plugin_test.exs index 8636afd..ef16ef9 100644 --- a/test/jido_browser/plugin_test.exs +++ b/test/jido_browser/plugin_test.exs @@ -23,9 +23,9 @@ defmodule Jido.Browser.PluginTest do assert "automation" in tags end - test "has 37 actions" do + test "has 38 actions" do actions = Plugin.actions() - assert length(actions) == 37 + assert length(actions) == 38 end test "includes all expected action modules" do @@ -73,13 +73,14 @@ defmodule Jido.Browser.PluginTest do # Advanced assert Jido.Browser.Actions.Evaluate in actions + assert Jido.Browser.Actions.WebFetch in actions end end describe "signal_routes/1" do - test "returns 37 routes" do + test "returns 38 routes" do routes = Plugin.signal_routes(%{}) - assert length(routes) == 37 + assert length(routes) == 38 end test "maps browser.navigate to Navigate action" do @@ -122,6 +123,8 @@ defmodule Jido.Browser.PluginTest do assert state.adapter == Jido.Browser.Adapters.AgentBrowser assert state.last_url == nil assert state.last_title == nil + assert state.seen_urls == [] + assert state.web_fetch_uses == 0 end test "accepts headless config override" do @@ -154,7 +157,7 @@ defmodule Jido.Browser.PluginTest do test "returns list of signal patterns" do patterns = Plugin.signal_patterns() assert is_list(patterns) - assert length(patterns) == 37 + assert length(patterns) == 38 end test "all patterns have browser. prefix" do @@ -173,6 +176,7 @@ defmodule Jido.Browser.PluginTest do assert "browser.save_state" in patterns assert "browser.tab_list" in patterns assert "browser.console" in patterns + assert "browser.web_fetch" in patterns end end @@ -188,6 +192,36 @@ defmodule Jido.Browser.PluginTest do assert Plugin.transform_result(:some_action, result, %{}) == result end + test "tracks discovered URLs and fetch usage for web fetch results" do + context = %{skill_state: %{seen_urls: ["https://seed.example"], web_fetch_uses: 1}} + + result = + Plugin.transform_result( + Jido.Browser.Actions.WebFetch, + {:ok, %{url: "https://example.com", final_url: "https://example.com/final", status: "success"}}, + context + ) + + assert {:ok, _result, state_updates} = result + + assert Enum.sort(state_updates.seen_urls) == + Enum.sort(["https://seed.example", "https://example.com", "https://example.com/final"]) + + assert state_updates.web_fetch_uses == 2 + end + + test "tracks URLs returned by search results" do + result = + Plugin.transform_result( + Jido.Browser.Actions.SearchWeb, + {:ok, %{results: [%{url: "https://elixir-lang.org"}]}}, + %{skill_state: %{}} + ) + + assert {:ok, _result, state_updates} = result + assert state_updates.seen_urls == ["https://elixir-lang.org"] + end + test "enhances error results when session available" do context = %{ skill_state: %{ diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs new file mode 100644 index 0000000..c871bd0 --- /dev/null +++ b/test/jido_browser/web_fetch_test.exs @@ -0,0 +1,166 @@ +defmodule Jido.Browser.WebFetchTest do + use ExUnit.Case, async: false + use Mimic + + alias Jido.Browser.Error + alias Jido.Browser.WebFetch + + setup :set_mimic_global + + setup_all do + Mimic.copy(Req) + :ok + end + + setup do + WebFetch.clear_cache() + :ok + end + + describe "web_fetch/2" do + test "fetches HTML content with selector extraction and citation passages" do + expect(Req, :run, fn opts -> + assert opts[:url] == "https://example.com/article" + + request = Req.Request.new(url: "https://example.com/article") + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["text/html; charset=utf-8"]}, + body: """ + + Example Article + + +
+

Hello

+

Alpha paragraph.

+

Beta paragraph.

+
+ + + """ + } + + {request, response} + end) + + assert {:ok, result} = + Jido.Browser.web_fetch( + "https://example.com/article", + selector: "main", + format: :markdown, + citations: true + ) + + assert result.title == "Example Article" + assert result.document_type == :html + assert result.format == :markdown + assert result.content =~ "Hello" + assert result.content =~ "Alpha paragraph." + assert result.cached == false + assert result.citations.enabled == true + assert [%{start_char: 0, text: passage_text} | _] = result.passages + assert passage_text =~ "Hello" + end + + test "applies focused filtering to plain text responses" do + expect(Req, :run, fn opts -> + request = Req.Request.new(url: opts[:url]) + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["text/plain"]}, + body: """ + Intro section + + The relevant paragraph mentions Elixir and OTP. + + Closing section + """ + } + + {request, response} + end) + + assert {:ok, result} = + Jido.Browser.web_fetch( + "https://example.com/notes.txt", + format: :text, + focus_terms: ["elixir"] + ) + + assert result.filtered == true + assert result.focus_matches == 1 + assert result.content =~ "relevant paragraph" + refute result.content =~ "Intro section" + end + + test "rejects URLs outside allowed_domains" do + assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} = + Jido.Browser.web_fetch( + "https://example.com/private", + allowed_domains: ["docs.example.com"] + ) + end + + test "enforces known URL provenance when requested" do + assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} = + Jido.Browser.web_fetch( + "https://example.com/private", + require_known_url: true, + known_urls: ["https://example.com/public"] + ) + end + + test "caps returned content by approximate token budget" do + expect(Req, :run, fn opts -> + request = Req.Request.new(url: opts[:url]) + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["text/plain"]}, + body: String.duplicate("abcdef", 20) + } + + {request, response} + end) + + assert {:ok, result} = + Jido.Browser.web_fetch( + "https://example.com/large.txt", + format: :text, + max_content_tokens: 5 + ) + + assert result.truncated == true + assert result.original_estimated_tokens > 5 + assert result.estimated_tokens <= 5 + end + + test "reuses cached responses for identical requests" do + expect(Req, :run, fn opts -> + request = Req.Request.new(url: opts[:url]) + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["text/plain"]}, + body: "cached content" + } + + {request, response} + end) + + assert {:ok, first} = Jido.Browser.web_fetch("https://example.com/cache.txt", format: :text) + assert {:ok, second} = Jido.Browser.web_fetch("https://example.com/cache.txt", format: :text) + + assert first.cached == false + assert second.cached == true + assert first.content == second.content + end + end +end From 48c8632cab7000f0f080538da2553fce86d6ce3f Mon Sep 17 00:00:00 2001 From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:00:41 -0500 Subject: [PATCH 2/7] chore: remove changelog entry from web fetch PR --- CHANGELOG.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99820c3..115277a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -### Added - -- Add HTTP-first `Jido.Browser.web_fetch/2` and `Jido.Browser.Actions.WebFetch` for stateless page retrieval with domain policy, focused filtering, caching, and citation-ready passages - ### Changed - Rename the public Elixir namespace from `JidoBrowser.*` to `Jido.Browser.*` From f57bdd25538d4f0e2cbcfbd3db8d22284606d5b0 Mon Sep 17 00:00:00 2001 From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:01:51 -0500 Subject: [PATCH 3/7] chore: drop changelog newline diff --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 115277a..fb19c91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -110,4 +110,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Refactoring: -* streamline agent-browser runtime defaults by mikehostetler +* streamline agent-browser runtime defaults by mikehostetler \ No newline at end of file From 5c674e69e2e543ea0ee2067c695a2e3badbd00ca Mon Sep 17 00:00:00 2001 From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:15:02 -0500 Subject: [PATCH 4/7] feat: use extractous for web fetch documents --- README.md | 7 +- lib/jido_browser/actions/web_fetch.ex | 8 +- lib/jido_browser/web_fetch.ex | 378 +++++++++++++++++++------- mix.exs | 1 + mix.lock | 4 + test/jido_browser/web_fetch_test.exs | 97 +++++++ 6 files changed, 387 insertions(+), 108 deletions(-) diff --git a/README.md b/README.md index c1d1808..590898a 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,8 @@ result.content result.passages ``` +`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. + ### State Persistence ```elixir @@ -164,7 +166,10 @@ Optional web fetch settings: ```elixir config :jido_browser, :web_fetch, cache_ttl_ms: 300_000, - pdftotext_path: "/usr/local/bin/pdftotext" + extractous: [ + pdf: [extract_annotation_text: true], + office: [include_headers_and_footers: true] + ] ``` ## Backends diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex index 417f07d..7efcaae 100644 --- a/lib/jido_browser/actions/web_fetch.ex +++ b/lib/jido_browser/actions/web_fetch.ex @@ -1,17 +1,17 @@ defmodule Jido.Browser.Actions.WebFetch do @moduledoc """ - Stateless HTTP-first page retrieval for agent workflows. + Stateless HTTP-first document retrieval for agent workflows. `WebFetch` is a lighter-weight alternative to browser navigation when the target content can be retrieved over plain HTTP(S) without JavaScript - execution. + execution, including fetched PDFs and office-style documents. """ use Jido.Action, name: "web_fetch", description: - "Fetch a URL over HTTP(S) with domain policy controls, optional focused filtering, " <> - "approximate token caps, and citation-ready passages.", + "Fetch a URL over HTTP(S) with domain policy controls, Extractous-backed document extraction, " <> + "optional focused filtering, approximate token caps, and citation-ready passages.", category: "Browser", tags: ["browser", "web", "fetch", "http", "retrieval"], vsn: "2.0.0", diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex index 8599598..d94b51d 100644 --- a/lib/jido_browser/web_fetch.ex +++ b/lib/jido_browser/web_fetch.ex @@ -1,7 +1,8 @@ defmodule Jido.Browser.WebFetch do @moduledoc """ Stateless HTTP-first web retrieval with optional domain policy, caching, - focused filtering, and citation-ready passage metadata. + focused filtering, citation-ready passage metadata, and Extractous-backed + document extraction. This module is intended for document retrieval workloads where starting a full browser session would be unnecessary or too expensive. @@ -16,8 +17,64 @@ defmodule Jido.Browser.WebFetch do @default_max_url_length 2_048 @supported_formats [:markdown, :text, :html] @html_content_types ["text/html", "application/xhtml+xml"] - @text_content_types ["text/plain", "text/markdown", "text/csv", "text/xml", "application/xml"] - @pdf_content_types ["application/pdf"] + @text_content_types [ + "text/plain", + "text/markdown", + "text/csv", + "text/xml", + "application/xml", + "application/json", + "application/ld+json" + ] + @document_content_types %{ + "application/pdf" => :pdf, + "application/msword" => :word_processing, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => :word_processing, + "application/vnd.ms-word.document.macroenabled.12" => :word_processing, + "application/vnd.openxmlformats-officedocument.wordprocessingml.template" => :word_processing, + "application/vnd.ms-word.template.macroenabled.12" => :word_processing, + "application/vnd.ms-excel" => :spreadsheet, + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => :spreadsheet, + "application/vnd.ms-excel.sheet.macroenabled.12" => :spreadsheet, + "application/vnd.openxmlformats-officedocument.spreadsheetml.template" => :spreadsheet, + "application/vnd.ms-excel.template.macroenabled.12" => :spreadsheet, + "application/vnd.ms-powerpoint" => :presentation, + "application/vnd.openxmlformats-officedocument.presentationml.presentation" => :presentation, + "application/vnd.ms-powerpoint.presentation.macroenabled.12" => :presentation, + "application/vnd.openxmlformats-officedocument.presentationml.slideshow" => :presentation, + "application/vnd.openxmlformats-officedocument.presentationml.template" => :presentation, + "application/vnd.oasis.opendocument.text" => :word_processing, + "application/vnd.oasis.opendocument.spreadsheet" => :spreadsheet, + "application/vnd.oasis.opendocument.presentation" => :presentation, + "application/rtf" => :word_processing, + "text/rtf" => :word_processing, + "application/epub+zip" => :ebook, + "message/rfc822" => :email, + "application/vnd.ms-outlook" => :email + } + @document_extensions %{ + "pdf" => :pdf, + "doc" => :word_processing, + "docx" => :word_processing, + "docm" => :word_processing, + "dotx" => :word_processing, + "dotm" => :word_processing, + "odt" => :word_processing, + "rtf" => :word_processing, + "xls" => :spreadsheet, + "xlsx" => :spreadsheet, + "xlsm" => :spreadsheet, + "xlsb" => :spreadsheet, + "ods" => :spreadsheet, + "ppt" => :presentation, + "pptx" => :presentation, + "pptm" => :presentation, + "ppsx" => :presentation, + "odp" => :presentation, + "epub" => :ebook, + "eml" => :email, + "msg" => :email + } @type result :: %{ required(:url) => String.t(), @@ -35,7 +92,8 @@ defmodule Jido.Browser.WebFetch do required(:cached) => boolean(), required(:citations) => %{enabled: boolean()}, required(:passages) => list(map()), - optional(:title) => String.t() | nil + optional(:title) => String.t() | nil, + optional(:metadata) => map() } @doc """ @@ -53,6 +111,7 @@ defmodule Jido.Browser.WebFetch do - `:cache` - enable ETS cache, defaults to `true` - `:cache_ttl_ms` - cache TTL in milliseconds - `:require_known_url` / `:known_urls` - optional URL provenance guard + - `:extractous` - optional `ExtractousEx` keyword options merged with config """ @spec fetch(String.t(), keyword()) :: {:ok, result()} | {:error, Exception.t()} def fetch(url, opts \\ []) @@ -125,13 +184,14 @@ defmodule Jido.Browser.WebFetch do defp build_result(url, final_url, response, opts) do content_type = response_content_type(response) + document_type = extractable_document_type(content_type, final_url, response.body) cond do content_type in @html_content_types -> build_html_result(url, final_url, response.body, content_type, opts) - content_type in @pdf_content_types -> - build_pdf_result(url, final_url, response.body, content_type, opts) + not is_nil(document_type) -> + build_document_result(url, final_url, response.body, content_type, document_type, opts) text_content_type?(content_type) -> build_text_result(url, final_url, response.body, content_type, opts) @@ -221,11 +281,11 @@ defmodule Jido.Browser.WebFetch do })} end - defp build_pdf_result(url, final_url, body, content_type, opts) when is_binary(body) do + defp build_document_result(url, final_url, body, content_type, document_type, opts) when is_binary(body) do cond do opts[:selector] -> {:error, - Error.invalid_error("Selector filtering is not supported for PDF content", %{ + Error.invalid_error("Selector filtering is only supported for HTML content", %{ error_code: :invalid_input, selector: opts[:selector], content_type: content_type @@ -233,14 +293,14 @@ defmodule Jido.Browser.WebFetch do opts[:format] == :html -> {:error, - Error.invalid_error("HTML output is not supported for PDF content", %{ + Error.invalid_error("HTML output is only supported for HTML content", %{ error_code: :invalid_input, format: :html, content_type: content_type })} true -> - with {:ok, text} <- extract_pdf_text(body), + with {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts), {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts), {final_content, truncated, original_estimated_tokens} <- maybe_truncate(filtered_content, opts[:max_content_tokens]) do @@ -249,22 +309,23 @@ defmodule Jido.Browser.WebFetch do url, final_url, final_content, - title_from_url(final_url), + document_title(metadata, final_url), content_type, - :pdf, + document_type, opts, truncated, filtered, focus_matches, - original_estimated_tokens + original_estimated_tokens, + metadata )} end end end - defp build_pdf_result(_url, _final_url, body, content_type, _opts) do + defp build_document_result(_url, _final_url, body, content_type, _document_type, _opts) do {:error, - Error.adapter_error("Unexpected response body for PDF fetch", %{ + Error.adapter_error("Unexpected response body for document fetch", %{ error_code: :unavailable, content_type: content_type, body: body @@ -282,7 +343,8 @@ defmodule Jido.Browser.WebFetch do truncated, filtered, focus_matches, - original_estimated_tokens + original_estimated_tokens, + metadata \\ nil ) do passages = maybe_build_passages(content, title, final_url, opts[:citations]) @@ -304,6 +366,7 @@ defmodule Jido.Browser.WebFetch do citations: %{enabled: opts[:citations]}, passages: passages } + |> maybe_put_metadata(metadata) end defp normalize_opts(opts) do @@ -311,42 +374,46 @@ defmodule Jido.Browser.WebFetch do citations = normalize_citations(opts[:citations]) focus_terms = normalize_focus_terms(opts[:focus_terms]) - cond do - format not in @supported_formats -> - {:error, - Error.invalid_error("Unsupported web fetch format", %{ - error_code: :invalid_input, - format: format, - supported_formats: @supported_formats - })} + with {:ok, configured_extractous_opts} <- normalize_extractous_opts(config(:extractous, [])), + {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])) do + cond do + format not in @supported_formats -> + {:error, + Error.invalid_error("Unsupported web fetch format", %{ + error_code: :invalid_input, + format: format, + supported_formats: @supported_formats + })} - present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) -> - {:error, - Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{ - error_code: :invalid_input - })} + present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) -> + {:error, + Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{ + error_code: :invalid_input + })} - format == :html and focus_terms != [] -> - {:error, - Error.invalid_error("Focused filtering is only supported for markdown and text output", %{ - error_code: :invalid_input, - format: format - })} + format == :html and focus_terms != [] -> + {:error, + Error.invalid_error("Focused filtering is only supported for markdown and text output", %{ + error_code: :invalid_input, + format: format + })} - true -> - normalized = - opts - |> Keyword.put(:format, format) - |> Keyword.put(:citations, citations) - |> Keyword.put(:focus_terms, focus_terms) - |> Keyword.put_new(:focus_window, 0) - |> Keyword.put_new(:timeout, config(:timeout, @default_timeout)) - |> Keyword.put_new(:max_redirects, @default_max_redirects) - |> Keyword.put_new(:cache, true) - |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)) - |> Keyword.put_new(:known_urls, []) - - {:ok, normalized} + true -> + normalized = + opts + |> Keyword.put(:format, format) + |> Keyword.put(:citations, citations) + |> Keyword.put(:focus_terms, focus_terms) + |> Keyword.put(:extractous, merge_extractous_opts(configured_extractous_opts, request_extractous_opts)) + |> Keyword.put_new(:focus_window, 0) + |> Keyword.put_new(:timeout, config(:timeout, @default_timeout)) + |> Keyword.put_new(:max_redirects, @default_max_redirects) + |> Keyword.put_new(:cache, true) + |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)) + |> Keyword.put_new(:known_urls, []) + + {:ok, normalized} + end end end @@ -694,44 +761,44 @@ defmodule Jido.Browser.WebFetch do end end - defp extract_pdf_text(bytes) do - case pdftotext_path() do - nil -> + defp extract_document_content(bytes, final_url, content_type, document_type, opts) do + case ExtractousEx.extract_from_bytes(bytes, opts[:extractous]) do + {:ok, %{content: content, metadata: metadata}} when is_binary(content) -> + {:ok, String.trim(content), normalize_metadata(metadata)} + + {:ok, %{content: content}} when is_binary(content) -> + {:ok, String.trim(content), %{}} + + {:ok, result} -> {:error, - Error.adapter_error("PDF extraction requires pdftotext to be installed", %{ - error_code: :unsupported_content_type, - content_type: "application/pdf" + Error.adapter_error("ExtractousEx returned an unexpected document payload", %{ + error_code: :unavailable, + url: final_url, + content_type: content_type, + document_type: document_type, + result: result })} - binary -> - with_tmp_files("jido_browser_web_fetch", ".pdf", ".txt", fn pdf_path, txt_path -> - File.write!(pdf_path, bytes) - - case System.cmd(binary, ["-layout", "-nopgbrk", pdf_path, txt_path], stderr_to_stdout: true) do - {_output, 0} -> - case File.read(txt_path) do - {:ok, text} -> - {:ok, String.trim(text)} - - {:error, reason} -> - {:error, - Error.adapter_error("Failed to read extracted PDF text", %{error_code: :unavailable, reason: reason})} - end - - {output, status} -> - {:error, - Error.adapter_error("pdftotext failed while extracting PDF", %{ - error_code: :unavailable, - status: status, - output: output - })} - end - end) + {:error, reason} -> + {:error, + Error.adapter_error("ExtractousEx failed while extracting document content", %{ + error_code: :unavailable, + url: final_url, + content_type: content_type, + document_type: document_type, + reason: reason + })} end - end - - defp pdftotext_path do - config(:pdftotext_path) || System.find_executable("pdftotext") + rescue + error -> + {:error, + Error.adapter_error("ExtractousEx failed while extracting document content", %{ + error_code: :unavailable, + url: final_url, + content_type: content_type, + document_type: document_type, + reason: error + })} end defp fetch_cached(url, opts) do @@ -783,12 +850,16 @@ defmodule Jido.Browser.WebFetch do defp cache_key(url, opts) do {:jido_browser_web_fetch, url, opts[:format], opts[:selector], opts[:allowed_domains], opts[:blocked_domains], - opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations]} + opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations], opts[:extractous]} end defp request_headers do [ - {"accept", "text/html,application/xhtml+xml,text/plain,application/pdf;q=0.9,*/*;q=0.1"}, + {"accept", + "text/html,application/xhtml+xml,text/plain,application/json,application/pdf," <> + "application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document," <> + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet," <> + "application/vnd.openxmlformats-officedocument.presentationml.presentation,*/*;q=0.1"}, {"user-agent", user_agent()} ] end @@ -814,10 +885,15 @@ defmodule Jido.Browser.WebFetch do end defp infer_content_type(body) when is_binary(body) do - if String.starts_with?(body, "%PDF-") do - "application/pdf" - else - "text/plain" + cond do + String.starts_with?(body, "%PDF-") -> + "application/pdf" + + likely_text?(body) -> + "text/plain" + + true -> + "application/octet-stream" end end @@ -857,6 +933,38 @@ defmodule Jido.Browser.WebFetch do |> Enum.uniq() end + defp normalize_extractous_opts(nil), do: {:ok, []} + + defp normalize_extractous_opts(opts) when is_list(opts) do + if Keyword.keyword?(opts) do + {:ok, opts} + else + {:error, + Error.invalid_error("Extractous options must be a keyword list", %{ + error_code: :invalid_input, + extractous: opts + })} + end + end + + defp normalize_extractous_opts(opts) do + {:error, + Error.invalid_error("Extractous options must be a keyword list", %{ + error_code: :invalid_input, + extractous: opts + })} + end + + defp merge_extractous_opts(left, right) do + Keyword.merge(left, right, fn _key, left_value, right_value -> + if Keyword.keyword?(left_value) and Keyword.keyword?(right_value) do + merge_extractous_opts(left_value, right_value) + else + right_value + end + end) + end + defp normalize_known_url(url) when is_binary(url) do url |> String.trim() @@ -876,6 +984,68 @@ defmodule Jido.Browser.WebFetch do defp normalize_rule_path(""), do: "/" defp normalize_rule_path(path), do: if(String.starts_with?(path, "/"), do: path, else: "/" <> path) + defp extractable_document_type(content_type, final_url, body) do + Map.get(@document_content_types, content_type) || + infer_document_type_from_body(body) || + if(ambiguous_binary_content_type?(content_type), do: infer_document_type_from_url(final_url), else: nil) + end + + defp infer_document_type_from_url(url) do + url + |> URI.parse() + |> Map.get(:path, "") + |> Path.extname() + |> String.trim_leading(".") + |> String.downcase() + |> case do + "" -> nil + extension -> Map.get(@document_extensions, extension) + end + end + + defp infer_document_type_from_body(body) when is_binary(body) do + if String.starts_with?(body, "%PDF-"), do: :pdf, else: nil + end + + defp infer_document_type_from_body(_body), do: nil + + defp document_title(metadata, url) do + metadata + |> metadata_title() + |> blank_to_nil() + |> case do + nil -> title_from_url(url) + title -> title + end + end + + defp metadata_title(metadata) when is_map(metadata) do + Enum.find_value([:title, "title", "dc:title", :"dc:title"], fn key -> + metadata + |> Map.get(key) + |> metadata_value_to_string() + |> blank_to_nil() + end) + end + + defp metadata_title(_metadata), do: nil + + defp metadata_value_to_string(nil), do: nil + defp metadata_value_to_string(value) when is_binary(value), do: String.trim(value) + + defp metadata_value_to_string(value) when is_list(value), + do: value |> Enum.map_join(" ", &to_string/1) |> String.trim() + + defp metadata_value_to_string(value) when is_atom(value), do: value |> Atom.to_string() |> String.trim() + defp metadata_value_to_string(value) when is_number(value), do: value |> to_string() |> String.trim() + defp metadata_value_to_string(_value), do: nil + + defp normalize_metadata(metadata) when is_map(metadata), do: metadata + defp normalize_metadata(_metadata), do: %{} + + defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response + defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata) + defp title_from_url(url) do path = URI.parse(url).path || "" @@ -894,22 +1064,24 @@ defmodule Jido.Browser.WebFetch do String.printable?(value) and String.match?(value, ~r/^[\x00-\x7F]+$/) end - defp config(key, default \\ nil) do - :jido_browser - |> Application.get_env(:web_fetch, []) - |> Keyword.get(key, default) + defp ambiguous_binary_content_type?(content_type) do + content_type in [ + "application/octet-stream", + "binary/octet-stream", + "application/download", + "application/x-download", + "application/zip", + "application/x-zip-compressed" + ] end - defp with_tmp_files(prefix, first_suffix, second_suffix, fun) do - base = Path.join(System.tmp_dir!(), "#{prefix}_#{System.unique_integer([:positive])}") - first = base <> first_suffix - second = base <> second_suffix + defp likely_text?(body) when is_binary(body) do + String.valid?(body) and not String.contains?(body, <<0>>) + end - try do - fun.(first, second) - after - File.rm(first) - File.rm(second) - end + defp config(key, default) do + :jido_browser + |> Application.get_env(:web_fetch, []) + |> Keyword.get(key, default) end end diff --git a/mix.exs b/mix.exs index 091d21a..9fad805 100644 --- a/mix.exs +++ b/mix.exs @@ -71,6 +71,7 @@ defmodule Jido.Browser.MixProject do {:uniq, "~> 0.6"}, {:floki, "~> 0.38"}, {:html2markdown, "~> 0.3"}, + {:extractous_ex, "~> 0.2"}, # Dev/Test {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, diff --git a/mix.lock b/mix.lock index aaf36a7..e91947c 100644 --- a/mix.lock +++ b/mix.lock @@ -1,6 +1,7 @@ %{ "abacus": {:hex, :abacus, "2.1.0", "b6db5c989ba3d9dd8c36d1cb269e2f0058f34768d47c67eb8ce06697ecb36dd4", [:mix], [], "hexpm", "255de08b02884e8383f1eed8aa31df884ce0fb5eb394db81ff888089f2a1bbff"}, "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"}, + "castore": {:hex, :castore, "1.0.18", "5e43ef0ec7d31195dfa5a65a86e6131db999d074179d2ba5a8de11fe14570f55", [:mix], [], "hexpm", "f393e4fe6317829b158fb74d86eb681f737d2fe326aa61ccf6293c4104957e34"}, "certifi": {:hex, :certifi, "2.15.0", "0e6e882fcdaaa0a5a9f2b3db55b1394dba07e8d6d9bcad08318fb604c6839712", [:rebar3], [], "hexpm", "b147ed22ce71d72eafdad94f055165c1c182f61a2ff49df28bcc71d1d5b94a60"}, "credo": {:hex, :credo, "1.7.17", "f92b6aa5b26301eaa5a35e4d48ebf5aa1e7094ac00ae38f87086c562caf8a22f", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "1eb5645c835f0b6c9b5410f94b5a185057bcf6d62a9c2b476da971cde8749645"}, "crontab": {:hex, :crontab, "1.2.0", "503611820257939d5d0fd272eb2b454f48a470435a809479ddc2c40bb515495c", [:mix], [{:ecto, "~> 1.0 or ~> 2.0 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "ebd7ef4d831e1b20fa4700f0de0284a04cac4347e813337978e25b4cc5cc2207"}, @@ -12,6 +13,7 @@ "erlex": {:hex, :erlex, "0.2.8", "cd8116f20f3c0afe376d1e8d1f0ae2452337729f68be016ea544a72f767d9c12", [:mix], [], "hexpm", "9d66ff9fedf69e49dc3fd12831e12a8a37b76f8651dd21cd45fcf5561a8a7590"}, "ex_doc": {:hex, :ex_doc, "0.40.1", "67542e4b6dde74811cfd580e2c0149b78010fd13001fda7cfeb2b2c2ffb1344d", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"}, "excoveralls": {:hex, :excoveralls, "0.18.5", "e229d0a65982613332ec30f07940038fe451a2e5b29bce2a5022165f0c9b157e", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "523fe8a15603f86d64852aab2abe8ddbd78e68579c8525ae765facc5eae01562"}, + "extractous_ex": {:hex, :extractous_ex, "0.2.1", "c9f7fd58b1d3b0d7eda9e219b1ed534a5b25e485884405d3ceee878e67248df2", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:rustler, "~> 0.37", [hex: :rustler, repo: "hexpm", optional: false]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "8c1a3c74105448545a8478c3610fc920b2da418d47eae656853dc3e881adebd0"}, "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, "finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"}, "floki": {:hex, :floki, "0.38.0", "62b642386fa3f2f90713f6e231da0fa3256e41ef1089f83b6ceac7a3fd3abf33", [:mix], [], "hexpm", "a5943ee91e93fb2d635b612caf5508e36d37548e84928463ef9dd986f0d1abd9"}, @@ -51,6 +53,8 @@ "private": {:hex, :private, "0.1.2", "da4add9f36c3818a9f849840ca43016c8ae7f76d7a46c3b2510f42dcc5632932", [:mix], [], "hexpm", "22ee01c3f450cf8d135da61e10ec59dde006238fab1ea039014791fc8f3ff075"}, "recase": {:hex, :recase, "0.8.1", "ab98cd35857a86fa5ca99036f575241d71d77d9c2ab0c39aacf1c9b61f6f7d1d", [:mix], [], "hexpm", "9fd8d63e7e43bd9ea385b12364e305778b2bbd92537e95c4b2e26fc507d5e4c2"}, "req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"}, + "rustler": {:hex, :rustler, "0.37.3", "5f4e6634d43b26f0a69834dd1d3ed4e1710b022a053bf4a670220c9540c92602", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "a6872c6f53dcf00486d1e7f9e046e20e01bf1654bdacc4193016c2e8002b32a2"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.4", "700a878312acfac79fb6c572bb8b57f5aae05fe1cf70d34b5974850bbf2c05bf", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "3b33d99b540b15f142ba47944f7a163a25069f6d608783c321029bc1ffb09514"}, "splode": {:hex, :splode, "0.3.0", "ff8effecc509a51245df2f864ec78d849248647c37a75886033e3b1a53ca9470", [:mix], [], "hexpm", "73cfd0892d7316d6f2c93e6e8784bd6e137b2aa38443de52fd0a25171d106d81"}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, "telemetry": {:hex, :telemetry, "1.4.1", "ab6de178e2b29b58e8256b92b382ea3f590a47152ca3651ea857a6cae05ac423", [:rebar3], [], "hexpm", "2172e05a27531d3d31dd9782841065c50dd5c3c7699d95266b2edd54c2dafa1c"}, diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs index c871bd0..43aad25 100644 --- a/test/jido_browser/web_fetch_test.exs +++ b/test/jido_browser/web_fetch_test.exs @@ -9,6 +9,7 @@ defmodule Jido.Browser.WebFetchTest do setup_all do Mimic.copy(Req) + Mimic.copy(ExtractousEx) :ok end @@ -98,6 +99,102 @@ defmodule Jido.Browser.WebFetchTest do refute result.content =~ "Intro section" end + test "extracts PDF content through ExtractousEx and preserves metadata" do + pdf_bytes = "%PDF-1.7 fake" + + expect(Req, :run, fn opts -> + request = Req.Request.new(url: opts[:url]) + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["application/pdf"]}, + body: pdf_bytes + } + + {request, response} + end) + + expect(ExtractousEx, :extract_from_bytes, fn ^pdf_bytes, opts -> + assert opts == [] + + {:ok, + %{ + content: "Extracted PDF body", + metadata: %{"title" => "Quarterly Report", "author" => "Ops"} + }} + end) + + assert {:ok, result} = + Jido.Browser.web_fetch( + "https://example.com/reports/q1.pdf", + format: :text, + citations: true + ) + + assert result.title == "Quarterly Report" + assert result.document_type == :pdf + assert result.content_type == "application/pdf" + assert result.content == "Extracted PDF body" + assert result.metadata == %{"title" => "Quarterly Report", "author" => "Ops"} + assert result.citations.enabled == true + assert [%{text: "Extracted PDF body"}] = result.passages + end + + test "extracts office documents served as octet-stream based on file extension" do + docx_bytes = <<80, 75, 3, 4, 20, 0, 0, 0>> + + expect(Req, :run, fn opts -> + request = Req.Request.new(url: opts[:url]) + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["application/octet-stream"]}, + body: docx_bytes + } + + {request, response} + end) + + expect(ExtractousEx, :extract_from_bytes, fn ^docx_bytes, opts -> + assert opts == [] + {:ok, %{content: "DOCX body", metadata: %{}}} + end) + + assert {:ok, result} = + Jido.Browser.web_fetch("https://example.com/specs/design.docx", format: :markdown) + + assert result.title == "design.docx" + assert result.document_type == :word_processing + assert result.content_type == "application/octet-stream" + assert result.content == "DOCX body" + end + + test "returns an adapter error when ExtractousEx extraction fails" do + pdf_bytes = "%PDF-1.7 broken" + + expect(Req, :run, fn opts -> + request = Req.Request.new(url: opts[:url]) + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["application/pdf"]}, + body: pdf_bytes + } + + {request, response} + end) + + expect(ExtractousEx, :extract_from_bytes, fn ^pdf_bytes, [] -> + {:error, "parse failed"} + end) + + assert {:error, %Error.AdapterError{details: %{error_code: :unavailable, document_type: :pdf}}} = + Jido.Browser.web_fetch("https://example.com/broken.pdf", format: :text) + end + test "rejects URLs outside allowed_domains" do assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} = Jido.Browser.web_fetch( From 909aad680a0e5b0d63f159a0648ed2770f5b5de3 Mon Sep 17 00:00:00 2001 From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:17:46 -0500 Subject: [PATCH 5/7] refactor: harden web fetch normalization --- README.md | 5 +- lib/jido_browser/web_fetch.ex | 242 +++++++++++++++++---------- test/jido_browser/web_fetch_test.exs | 30 ++++ 3 files changed, 185 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index 590898a..f6f78bd 100644 --- a/README.md +++ b/README.md @@ -92,9 +92,10 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow: result.content result.passages +result.metadata ``` -`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. +`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. Binary document responses may also include `result.metadata` when extraction returns document metadata. ### State Persistence @@ -172,6 +173,8 @@ config :jido_browser, :web_fetch, ] ``` +Configured `extractous` options are merged with any per-call `extractous:` keyword options passed to `Jido.Browser.web_fetch/2`. + ## Backends ### AgentBrowser (Default) diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex index d94b51d..5c5cf81 100644 --- a/lib/jido_browser/web_fetch.ex +++ b/lib/jido_browser/web_fetch.ex @@ -153,6 +153,7 @@ defmodule Jido.Browser.WebFetch do url: url, headers: request_headers(), receive_timeout: opts[:timeout], + decode_body: false, redirect: true, max_redirects: opts[:max_redirects] ] @@ -211,24 +212,8 @@ defmodule Jido.Browser.WebFetch do with {:ok, document} <- parse_document(body), {:ok, html} <- select_html(document, body, selector), {:ok, title} <- extract_title(document), - {:ok, content} <- format_html(html, opts[:format], opts), - {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts), - {final_content, truncated, original_estimated_tokens} <- - maybe_truncate(filtered_content, opts[:max_content_tokens]) do - {:ok, - build_response( - url, - final_url, - final_content, - title, - content_type, - :html, - opts, - truncated, - filtered, - focus_matches, - original_estimated_tokens - )} + {:ok, content} <- format_html(html, opts[:format], opts) do + finalize_result(url, final_url, content, title, content_type, :html, opts) end end @@ -242,33 +227,9 @@ defmodule Jido.Browser.WebFetch do end defp build_text_result(url, final_url, body, content_type, opts) when is_binary(body) do - if opts[:selector] do - {:error, - Error.invalid_error("Selector filtering is only supported for HTML content", %{ - error_code: :invalid_input, - selector: opts[:selector], - content_type: content_type - })} - else - with {:ok, content} <- format_text(body, opts[:format]), - {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts), - {final_content, truncated, original_estimated_tokens} <- - maybe_truncate(filtered_content, opts[:max_content_tokens]) do - {:ok, - build_response( - url, - final_url, - final_content, - nil, - content_type, - :text, - opts, - truncated, - filtered, - focus_matches, - original_estimated_tokens - )} - end + with :ok <- validate_non_html_options(content_type, opts), + {:ok, content} <- format_text(body, opts[:format]) do + finalize_result(url, final_url, content, nil, content_type, :text, opts) end end @@ -282,44 +243,18 @@ defmodule Jido.Browser.WebFetch do end defp build_document_result(url, final_url, body, content_type, document_type, opts) when is_binary(body) do - cond do - opts[:selector] -> - {:error, - Error.invalid_error("Selector filtering is only supported for HTML content", %{ - error_code: :invalid_input, - selector: opts[:selector], - content_type: content_type - })} - - opts[:format] == :html -> - {:error, - Error.invalid_error("HTML output is only supported for HTML content", %{ - error_code: :invalid_input, - format: :html, - content_type: content_type - })} - - true -> - with {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts), - {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(text, opts), - {final_content, truncated, original_estimated_tokens} <- - maybe_truncate(filtered_content, opts[:max_content_tokens]) do - {:ok, - build_response( - url, - final_url, - final_content, - document_title(metadata, final_url), - content_type, - document_type, - opts, - truncated, - filtered, - focus_matches, - original_estimated_tokens, - metadata - )} - end + with :ok <- validate_non_html_options(content_type, opts), + {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts) do + finalize_result( + url, + final_url, + text, + document_title(metadata, final_url), + content_type, + document_type, + opts, + metadata + ) end end @@ -344,7 +279,7 @@ defmodule Jido.Browser.WebFetch do filtered, focus_matches, original_estimated_tokens, - metadata \\ nil + metadata ) do passages = maybe_build_passages(content, title, final_url, opts[:citations]) @@ -369,13 +304,76 @@ defmodule Jido.Browser.WebFetch do |> maybe_put_metadata(metadata) end + defp finalize_result(url, final_url, content, title, content_type, document_type, opts, metadata \\ nil) do + with {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts), + {final_content, truncated, original_estimated_tokens} <- + maybe_truncate(filtered_content, opts[:max_content_tokens]) do + {:ok, + build_response( + url, + final_url, + final_content, + title, + content_type, + document_type, + opts, + truncated, + filtered, + focus_matches, + original_estimated_tokens, + metadata + )} + end + end + + defp validate_non_html_options(content_type, opts) do + cond do + opts[:selector] -> + {:error, + Error.invalid_error("Selector filtering is only supported for HTML content", %{ + error_code: :invalid_input, + selector: opts[:selector], + content_type: content_type + })} + + opts[:format] == :html -> + {:error, + Error.invalid_error("HTML output is only supported for HTML content", %{ + error_code: :invalid_input, + format: :html, + content_type: content_type + })} + + true -> + :ok + end + end + defp normalize_opts(opts) do format = opts[:format] || :markdown citations = normalize_citations(opts[:citations]) focus_terms = normalize_focus_terms(opts[:focus_terms]) with {:ok, configured_extractous_opts} <- normalize_extractous_opts(config(:extractous, [])), - {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])) do + {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])), + {:ok, selector} <- normalize_selector(opts[:selector]), + {:ok, focus_window} <- normalize_integer_opt(:focus_window, Keyword.get(opts, :focus_window, 0), min: 0), + {:ok, timeout} <- + normalize_integer_opt(:timeout, Keyword.get(opts, :timeout, config(:timeout, @default_timeout)), min: 1), + {:ok, max_redirects} <- + normalize_integer_opt(:max_redirects, Keyword.get(opts, :max_redirects, @default_max_redirects), min: 0), + {:ok, cache_ttl_ms} <- + normalize_integer_opt( + :cache_ttl_ms, + Keyword.get(opts, :cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)), + min: 0 + ), + {:ok, max_content_tokens} <- + normalize_optional_integer_opt(:max_content_tokens, opts[:max_content_tokens], min: 1), + {:ok, max_url_length} <- normalize_optional_integer_opt(:max_url_length, opts[:max_url_length], min: 1), + {:ok, cache} <- normalize_boolean_opt(:cache, Keyword.get(opts, :cache, true)), + {:ok, require_known_url} <- + normalize_boolean_opt(:require_known_url, Keyword.get(opts, :require_known_url, false)) do cond do format not in @supported_formats -> {:error, @@ -402,14 +400,18 @@ defmodule Jido.Browser.WebFetch do normalized = opts |> Keyword.put(:format, format) + |> Keyword.put(:selector, selector) |> Keyword.put(:citations, citations) |> Keyword.put(:focus_terms, focus_terms) + |> Keyword.put(:focus_window, focus_window) + |> Keyword.put(:timeout, timeout) + |> Keyword.put(:max_redirects, max_redirects) + |> Keyword.put(:cache, cache) + |> Keyword.put(:cache_ttl_ms, cache_ttl_ms) + |> Keyword.put(:require_known_url, require_known_url) |> Keyword.put(:extractous, merge_extractous_opts(configured_extractous_opts, request_extractous_opts)) - |> Keyword.put_new(:focus_window, 0) - |> Keyword.put_new(:timeout, config(:timeout, @default_timeout)) - |> Keyword.put_new(:max_redirects, @default_max_redirects) - |> Keyword.put_new(:cache, true) - |> Keyword.put_new(:cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)) + |> maybe_put(:max_content_tokens, max_content_tokens) + |> maybe_put(:max_url_length, max_url_length) |> Keyword.put_new(:known_urls, []) {:ok, normalized} @@ -937,7 +939,7 @@ defmodule Jido.Browser.WebFetch do defp normalize_extractous_opts(opts) when is_list(opts) do if Keyword.keyword?(opts) do - {:ok, opts} + {:ok, canonicalize_keyword_list(opts)} else {:error, Error.invalid_error("Extractous options must be a keyword list", %{ @@ -955,6 +957,62 @@ defmodule Jido.Browser.WebFetch do })} end + defp normalize_selector(nil), do: {:ok, nil} + + defp normalize_selector(selector) when is_binary(selector) do + selector + |> String.trim() + |> case do + "" -> {:ok, nil} + value -> {:ok, value} + end + end + + defp normalize_selector(selector) do + {:error, + Error.invalid_error("Selector must be a string", %{ + error_code: :invalid_input, + selector: selector + })} + end + + defp normalize_integer_opt(_name, value, min: min) when is_integer(value) and value >= min, do: {:ok, value} + + defp normalize_integer_opt(name, value, min: min) do + {:error, + Error.invalid_error("#{name} must be an integer greater than or equal to #{min}", %{ + error_code: :invalid_input, + option: name, + value: value + })} + end + + defp normalize_optional_integer_opt(_name, nil, _opts), do: {:ok, nil} + defp normalize_optional_integer_opt(name, value, opts), do: normalize_integer_opt(name, value, opts) + + defp normalize_boolean_opt(_name, value) when is_boolean(value), do: {:ok, value} + + defp normalize_boolean_opt(name, value) do + {:error, + Error.invalid_error("#{name} must be a boolean", %{ + error_code: :invalid_input, + option: name, + value: value + })} + end + + defp canonicalize_keyword_list(keyword_list) do + keyword_list + |> Enum.map(fn {key, value} = pair -> + if is_list(value) and Keyword.keyword?(value) do + {key, canonicalize_keyword_list(value)} + else + pair + end + end) + |> Enum.sort_by(fn {key, _value} -> to_string(key) end) + end + defp merge_extractous_opts(left, right) do Keyword.merge(left, right, fn _key, left_value, right_value -> if Keyword.keyword?(left_value) and Keyword.keyword?(right_value) do @@ -1045,6 +1103,8 @@ defmodule Jido.Browser.WebFetch do defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata) + defp maybe_put(opts, _key, nil), do: opts + defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value) defp title_from_url(url) do path = URI.parse(url).path || "" diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs index 43aad25..931ada8 100644 --- a/test/jido_browser/web_fetch_test.exs +++ b/test/jido_browser/web_fetch_test.exs @@ -22,6 +22,7 @@ defmodule Jido.Browser.WebFetchTest do test "fetches HTML content with selector extraction and citation passages" do expect(Req, :run, fn opts -> assert opts[:url] == "https://example.com/article" + assert opts[:decode_body] == false request = Req.Request.new(url: "https://example.com/article") @@ -66,6 +67,27 @@ defmodule Jido.Browser.WebFetchTest do assert passage_text =~ "Hello" end + test "preserves JSON responses as text content" do + expect(Req, :run, fn opts -> + assert opts[:decode_body] == false + request = Req.Request.new(url: opts[:url]) + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["application/json"]}, + body: ~s({"name":"jido","kind":"agent"}) + } + + {request, response} + end) + + assert {:ok, result} = Jido.Browser.web_fetch("https://example.com/data.json", format: :text) + + assert result.document_type == :text + assert result.content =~ ~s("name":"jido") + end + test "applies focused filtering to plain text responses" do expect(Req, :run, fn opts -> request = Req.Request.new(url: opts[:url]) @@ -203,6 +225,14 @@ defmodule Jido.Browser.WebFetchTest do ) end + test "rejects invalid direct API options early" do + assert {:error, %Error.InvalidError{details: %{option: :timeout, error_code: :invalid_input}}} = + Jido.Browser.web_fetch("https://example.com/notes.txt", timeout: 0) + + assert {:error, %Error.InvalidError{details: %{extractous: [:bad, :shape], error_code: :invalid_input}}} = + Jido.Browser.web_fetch("https://example.com/notes.txt", extractous: [:bad, :shape]) + end + test "enforces known URL provenance when requested" do assert {:error, %Error.InvalidError{details: %{error_code: :url_not_allowed}}} = Jido.Browser.web_fetch( From 002dba516f34b3ce82ad6d4c00d6624a7deba901 Mon Sep 17 00:00:00 2001 From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:25:08 -0500 Subject: [PATCH 6/7] docs: clarify web fetch API --- README.md | 2 +- lib/jido_browser.ex | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f6f78bd..81940c0 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow: result.content result.passages -result.metadata +result.metadata # present when extraction returns document metadata ``` `web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. Binary document responses may also include `result.metadata` when extraction returns document metadata. diff --git a/lib/jido_browser.ex b/lib/jido_browser.ex index 91139ac..39a4424 100644 --- a/lib/jido_browser.ex +++ b/lib/jido_browser.ex @@ -109,7 +109,13 @@ defmodule Jido.Browser do end end - @doc "Fetches a URL over HTTP(S) without starting a browser session." + @doc """ + Fetches a URL over HTTP(S) without starting a browser session. + + HTML responses keep native selector extraction and format conversion, while + fetched binary documents such as PDFs and office files are extracted through + `ExtractousEx`. + """ @spec web_fetch(String.t(), keyword()) :: {:ok, map()} | {:error, term()} def web_fetch(url, opts \\ []) From 1d495fd9436b87c7f75d87bf98131dd3a5ce621d Mon Sep 17 00:00:00 2001 From: Mike Hostetler <84222+mikehostetler@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:39:02 -0500 Subject: [PATCH 7/7] fix: resolve lint regressions --- lib/jido_browser/actions/web_fetch.ex | 5 +- lib/jido_browser/plugin.ex | 17 +- lib/jido_browser/web_fetch.ex | 277 +++++++++++++------------- 3 files changed, 142 insertions(+), 157 deletions(-) diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex index 7efcaae..fa79361 100644 --- a/lib/jido_browser/actions/web_fetch.ex +++ b/lib/jido_browser/actions/web_fetch.ex @@ -40,11 +40,8 @@ defmodule Jido.Browser.Actions.WebFetch do {:ok, result} <- Jido.Browser.web_fetch(params.url, build_opts(params, context)) do {:ok, Map.put(result, :status, "success")} else - {:error, %_{} = error} -> + {:error, error} -> {:error, error} - - {:error, reason} -> - {:error, Error.adapter_error("Web fetch failed", %{reason: reason})} end end diff --git a/lib/jido_browser/plugin.ex b/lib/jido_browser/plugin.ex index 2a58bba..a99f841 100644 --- a/lib/jido_browser/plugin.ex +++ b/lib/jido_browser/plugin.ex @@ -285,7 +285,7 @@ defmodule Jido.Browser.Plugin do seen_urls = current_seen_urls |> Kernel.++(extract_urls(result)) - |> Enum.reject(&is_nil_or_empty/1) + |> Enum.reject(&nil_or_empty?/1) |> Enum.uniq() if seen_urls == [] or seen_urls == current_seen_urls do @@ -305,26 +305,23 @@ defmodule Jido.Browser.Plugin do defp extract_urls(result) do direct_urls = [Map.get(result, :url), Map.get(result, "url"), Map.get(result, :final_url), Map.get(result, "final_url")] - |> Enum.reject(&is_nil_or_empty/1) + |> Enum.reject(&nil_or_empty?/1) search_urls = result |> Map.get(:results, Map.get(result, "results", [])) |> List.wrap() |> Enum.map(fn item -> - cond do - is_map(item) -> Map.get(item, :url) || Map.get(item, "url") - true -> nil - end + if is_map(item), do: Map.get(item, :url) || Map.get(item, "url") end) - |> Enum.reject(&is_nil_or_empty/1) + |> Enum.reject(&nil_or_empty?/1) direct_urls ++ search_urls end - defp is_nil_or_empty(nil), do: true - defp is_nil_or_empty(""), do: true - defp is_nil_or_empty(_value), do: false + defp nil_or_empty?(nil), do: true + defp nil_or_empty?(""), do: true + defp nil_or_empty?(_value), do: false def signal_patterns do [ diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex index 5c5cf81..1eae9aa 100644 --- a/lib/jido_browser/web_fetch.ex +++ b/lib/jido_browser/web_fetch.ex @@ -177,9 +177,6 @@ defmodule Jido.Browser.WebFetch do {_request, %_{} = exception} -> {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: exception})} - - {_request, reason} -> - {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: reason})} end end @@ -267,62 +264,49 @@ defmodule Jido.Browser.WebFetch do })} end - defp build_response( - url, - final_url, - content, - title, - content_type, - document_type, - opts, - truncated, - filtered, - focus_matches, - original_estimated_tokens, - metadata - ) do - passages = maybe_build_passages(content, title, final_url, opts[:citations]) + defp build_response(opts, attrs) do + passages = maybe_build_passages(attrs.content, attrs.title, attrs.final_url, opts[:citations]) %{ - url: url, - final_url: final_url, - title: title, - content: content, + url: attrs.url, + final_url: attrs.final_url, + title: attrs.title, + content: attrs.content, format: opts[:format], - content_type: content_type, - document_type: document_type, + content_type: attrs.content_type, + document_type: attrs.document_type, retrieved_at: retrieved_at(), - estimated_tokens: estimate_tokens(content), - original_estimated_tokens: original_estimated_tokens, - truncated: truncated, - filtered: filtered, - focus_matches: focus_matches, + estimated_tokens: estimate_tokens(attrs.content), + original_estimated_tokens: attrs.original_estimated_tokens, + truncated: attrs.truncated, + filtered: attrs.filtered, + focus_matches: attrs.focus_matches, cached: false, citations: %{enabled: opts[:citations]}, passages: passages } - |> maybe_put_metadata(metadata) + |> maybe_put_metadata(attrs.metadata) end defp finalize_result(url, final_url, content, title, content_type, document_type, opts, metadata \\ nil) do with {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts), {final_content, truncated, original_estimated_tokens} <- maybe_truncate(filtered_content, opts[:max_content_tokens]) do - {:ok, - build_response( - url, - final_url, - final_content, - title, - content_type, - document_type, - opts, - truncated, - filtered, - focus_matches, - original_estimated_tokens, - metadata - )} + attrs = %{ + url: url, + final_url: final_url, + content: final_content, + title: title, + content_type: content_type, + document_type: document_type, + truncated: truncated, + filtered: filtered, + focus_matches: focus_matches, + original_estimated_tokens: original_estimated_tokens, + metadata: metadata + } + + {:ok, build_response(opts, attrs)} end end @@ -423,41 +407,10 @@ defmodule Jido.Browser.WebFetch do normalized_url = String.trim(url) max_url_length = opts[:max_url_length] || @default_max_url_length - cond do - normalized_url == "" -> - {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})} - - String.length(normalized_url) > max_url_length -> - {:error, - Error.invalid_error("URL exceeds maximum length", %{ - error_code: :url_too_long, - max_url_length: max_url_length - })} - - true -> - uri = URI.parse(normalized_url) - - cond do - uri.scheme not in ["http", "https"] -> - {:error, - Error.invalid_error("Web fetch only supports http and https URLs", %{ - error_code: :invalid_input, - scheme: uri.scheme - })} - - is_nil(uri.host) or uri.host == "" -> - {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})} - - not ascii_only?(uri.host) -> - {:error, - Error.invalid_error("Web fetch only accepts ASCII hostnames", %{ - error_code: :url_not_allowed, - host: uri.host - })} - - true -> - {:ok, URI.to_string(uri), normalize_uri(uri)} - end + with :ok <- validate_url_length(normalized_url, max_url_length), + {:ok, uri} <- parse_fetch_uri(normalized_url), + :ok <- validate_uri_host(uri) do + {:ok, URI.to_string(uri), normalize_uri(uri)} end end @@ -468,9 +421,7 @@ defmodule Jido.Browser.WebFetch do |> Enum.map(&normalize_known_url/1) |> Enum.reject(&is_nil/1) - if not Keyword.get(opts, :require_known_url, false) do - :ok - else + if Keyword.get(opts, :require_known_url, false) do if url in known_urls do :ok else @@ -480,6 +431,8 @@ defmodule Jido.Browser.WebFetch do url: url })} end + else + :ok end end @@ -566,7 +519,7 @@ defmodule Jido.Browser.WebFetch do end defp rule_matches?(%{host: host, path: path}, %URI{host: uri_host} = uri) do - uri_host = String.downcase(uri_host || "") + uri_host = String.downcase(uri_host) request_path = normalize_rule_path(uri.path || "/") host_matches? = uri_host == host or String.ends_with?(uri_host, "." <> host) @@ -677,36 +630,10 @@ defmodule Jido.Browser.WebFetch do terms -> sections = split_sections(content) - downcased_terms = Enum.map(terms, &String.downcase/1) - - matching_indexes = - sections - |> Enum.with_index() - |> Enum.flat_map(fn {section, index} -> - lowered = String.downcase(section) - - if Enum.any?(downcased_terms, &String.contains?(lowered, &1)) do - [index] - else - [] - end - end) - + matching_indexes = matching_section_indexes(sections, terms) window = max(opts[:focus_window] || 0, 0) - - kept_indexes = - matching_indexes - |> Enum.flat_map(fn index -> (index - window)..(index + window) end) - |> Enum.filter(&(&1 >= 0 and &1 < length(sections))) - |> Enum.uniq() - |> Enum.sort() - - filtered_content = - kept_indexes - |> Enum.map(&Enum.at(sections, &1)) - |> Enum.reject(&(&1 == "")) - |> Enum.join("\n\n") - |> String.trim() + kept_indexes = expand_focus_window(matching_indexes, window, length(sections)) + filtered_content = render_section_slice(sections, kept_indexes) {:ok, filtered_content, true, length(matching_indexes)} end @@ -768,19 +695,6 @@ defmodule Jido.Browser.WebFetch do {:ok, %{content: content, metadata: metadata}} when is_binary(content) -> {:ok, String.trim(content), normalize_metadata(metadata)} - {:ok, %{content: content}} when is_binary(content) -> - {:ok, String.trim(content), %{}} - - {:ok, result} -> - {:error, - Error.adapter_error("ExtractousEx returned an unexpected document payload", %{ - error_code: :unavailable, - url: final_url, - content_type: content_type, - document_type: document_type, - result: result - })} - {:error, reason} -> {:error, Error.adapter_error("ExtractousEx failed while extracting document content", %{ @@ -806,25 +720,28 @@ defmodule Jido.Browser.WebFetch do defp fetch_cached(url, opts) do if opts[:cache] do ensure_cache_table!() - now = System.system_time(:millisecond) - - case :ets.lookup(@cache_table, cache_key(url, opts)) do - [{_key, expires_at, result}] -> - if expires_at > now do - {:ok, Map.put(result, :cached, true)} - else - :ets.delete(@cache_table, cache_key(url, opts)) - :miss - end - - [] -> - :miss - end + lookup_cached_result(cache_key(url, opts), System.system_time(:millisecond)) else :miss end end + defp lookup_cached_result(key, now) do + case :ets.lookup(@cache_table, key) do + [{_key, expires_at, result}] -> handle_cached_result(key, expires_at, result, now) + [] -> :miss + end + end + + defp handle_cached_result(_key, expires_at, result, now) when expires_at > now do + {:ok, Map.put(result, :cached, true)} + end + + defp handle_cached_result(key, _expires_at, _result, _now) do + :ets.delete(@cache_table, key) + :miss + end + defp maybe_store_cache(url, opts, result) do if opts[:cache] do ensure_cache_table!() @@ -976,6 +893,52 @@ defmodule Jido.Browser.WebFetch do })} end + defp validate_url_length("", _max_url_length) do + {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})} + end + + defp validate_url_length(normalized_url, max_url_length) do + if String.length(normalized_url) > max_url_length do + {:error, + Error.invalid_error("URL exceeds maximum length", %{ + error_code: :url_too_long, + max_url_length: max_url_length + })} + else + :ok + end + end + + defp parse_fetch_uri(normalized_url) do + uri = URI.parse(normalized_url) + + if uri.scheme in ["http", "https"] do + {:ok, uri} + else + {:error, + Error.invalid_error("Web fetch only supports http and https URLs", %{ + error_code: :invalid_input, + scheme: uri.scheme + })} + end + end + + defp validate_uri_host(%URI{host: host}) when host in [nil, ""] do + {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})} + end + + defp validate_uri_host(%URI{host: host}) do + if ascii_only?(host) do + :ok + else + {:error, + Error.invalid_error("Web fetch only accepts ASCII hostnames", %{ + error_code: :url_not_allowed, + host: host + })} + end + end + defp normalize_integer_opt(_name, value, min: min) when is_integer(value) and value >= min, do: {:ok, value} defp normalize_integer_opt(name, value, min: min) do @@ -1086,8 +1049,6 @@ defmodule Jido.Browser.WebFetch do end) end - defp metadata_title(_metadata), do: nil - defp metadata_value_to_string(nil), do: nil defp metadata_value_to_string(value) when is_binary(value), do: String.trim(value) @@ -1099,13 +1060,43 @@ defmodule Jido.Browser.WebFetch do defp metadata_value_to_string(_value), do: nil defp normalize_metadata(metadata) when is_map(metadata), do: metadata - defp normalize_metadata(_metadata), do: %{} defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata) defp maybe_put(opts, _key, nil), do: opts defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value) + defp matching_section_indexes(sections, terms) do + downcased_terms = Enum.map(terms, &String.downcase/1) + + sections + |> Enum.with_index() + |> Enum.flat_map(fn {section, index} -> + if section_matches_term?(section, downcased_terms), do: [index], else: [] + end) + end + + defp section_matches_term?(section, downcased_terms) do + lowered = String.downcase(section) + Enum.any?(downcased_terms, &String.contains?(lowered, &1)) + end + + defp expand_focus_window(matching_indexes, window, section_count) do + matching_indexes + |> Enum.flat_map(fn index -> (index - window)..(index + window) end) + |> Enum.filter(&(&1 >= 0 and &1 < section_count)) + |> Enum.uniq() + |> Enum.sort() + end + + defp render_section_slice(sections, indexes) do + indexes + |> Enum.map(&Enum.at(sections, &1)) + |> Enum.reject(&(&1 == "")) + |> Enum.join("\n\n") + |> String.trim() + end + defp title_from_url(url) do path = URI.parse(url).path || ""